From 7b7eef29222869a1248f113131a158397fb3f031 Mon Sep 17 00:00:00 2001
From: gabriel-suela
Date: Wed, 15 Feb 2023 19:10:33 -0300
Subject: [PATCH 1/6] feat: export as parquet [WIP]

---
 .../QueryControlDropdown.jsx                 | 11 +++
 .../dashboard-widget/VisualizationWidget.jsx |  9 +++
 client/app/pages/queries/VisualizationEmbed.jsx | 11 +++
 redash/handlers/query_results.py             | 43 ++++++++----
 redash/serializers/__init__.py               |  1 +
 redash/serializers/query_result.py           | 69 +++++++++++++++++--
 6 files changed, 124 insertions(+), 20 deletions(-)

diff --git a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
index 89ce988fd3..617482e8ab 100644
--- a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
+++ b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
@@ -66,6 +66,17 @@ export default function QueryControlDropdown(props) {
         Download as Excel File
+
+
+        Download as Parquet File
+
+
 );

diff --git a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
index 9a021cc8bd..9ea4f7d108 100644
--- a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
+++ b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
@@ -58,6 +58,15 @@ function visualizationWidgetMenuOptions({ widget, canEditDashboard, onParameters
         "Download as Excel File"
       )}
     ,
+
+    {!isQueryResultEmpty ? (
+
+        Download as Parquet File
+
+    ) : (
+      "Download as Parquet File"
+    )}
+  ,
     (canViewQuery || canEditParameters) && ,
     canViewQuery && (

diff --git a/client/app/pages/queries/VisualizationEmbed.jsx b/client/app/pages/queries/VisualizationEmbed.jsx
index a4bcaf3177..a446f162f6 100644
--- a/client/app/pages/queries/VisualizationEmbed.jsx
+++ b/client/app/pages/queries/VisualizationEmbed.jsx
@@ -100,6 +100,17 @@ function VisualizationEmbedFooter({
         Download as Excel File
+
+
+        Download as Parquet File
+
+
 );

diff --git a/redash/handlers/query_results.py b/redash/handlers/query_results.py
index cb8b8d14f2..7f590ba934 100644
--- a/redash/handlers/query_results.py
+++ b/redash/handlers/query_results.py
@@ -1,40 +1,42 @@
 import logging
 import time
-
 import unicodedata
+
 from flask import make_response, request
 from flask_login import current_user
 from flask_restful import abort
 from werkzeug.urls import url_quote
+
 from redash import models, settings
 from redash.handlers.base import BaseResource, get_object_or_404, record_event
+from redash.models.parameterized_query import (
+    InvalidParameterError,
+    ParameterizedQuery,
+    QueryDetachedFromDataSourceError,
+    dropdown_values,
+)
 from redash.permissions import (
     has_access,
     not_view_only,
     require_access,
-    require_permission,
     require_any_of_permission,
+    require_permission,
     view_only,
 )
+from redash.serializers import (
+    serialize_job,
+    serialize_query_result,
+    serialize_query_result_to_dsv,
+    serialize_query_result_to_parquet,
+    serialize_query_result_to_xlsx,
+)
 from redash.tasks import Job
 from redash.tasks.queries import enqueue_query
 from redash.utils import (
     collect_parameters_from_request,
     json_dumps,
-    utcnow,
     to_filename,
-)
-from redash.models.parameterized_query import (
-    ParameterizedQuery,
-    InvalidParameterError,
-    QueryDetachedFromDataSourceError,
-    dropdown_values,
-)
-from redash.serializers import (
-    serialize_query_result,
-    serialize_query_result_to_dsv,
-    serialize_query_result_to_xlsx,
-    serialize_job,
+    utcnow,
 )
@@ -402,6 +404,7 @@ def get(self, query_id=None, query_result_id=None, filetype="json"):
             "xlsx": self.make_excel_response,
             "csv": self.make_csv_response,
             "tsv": self.make_tsv_response,
+            "parquet": self.make_parquet_response,
         }
 
         response = response_builders[filetype](query_result)
@@ -450,6 +453,16 @@ def make_excel_response(query_result):
         }
         return make_response(serialize_query_result_to_xlsx(query_result), 200, headers)
 
+    @staticmethod
+    def make_parquet_response(query_result):
+        headers = {
+            # https://issues.apache.org/jira/browse/PARQUET-1889
+            "Content-Type": "application/parquet",
+        }
+        return make_response(
+            serialize_query_result_to_parquet(query_result), 200, headers
+        )
+
 
 class JobResource(BaseResource):
     def get(self, job_id, query_id=None):

diff --git a/redash/serializers/__init__.py b/redash/serializers/__init__.py
index 6105364c49..4c9b91a8d1 100644
--- a/redash/serializers/__init__.py
+++ b/redash/serializers/__init__.py
@@ -19,6 +19,7 @@
     serialize_query_result,
     serialize_query_result_to_dsv,
     serialize_query_result_to_xlsx,
+    serialize_query_result_to_parquet,
 )

diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py
index 9eab2a1a42..3d9405c8be 100644
--- a/redash/serializers/query_result.py
+++ b/redash/serializers/query_result.py
@@ -1,11 +1,22 @@
-import io
 import csv
+import io
+
+import pyarrow
+import pyarrow.parquet
 import xlsxwriter
-from funcy import rpartial, project
 from dateutil.parser import isoparse as parse_date
-from redash.utils import json_loads, UnicodeWriter
-from redash.query_runner import TYPE_BOOLEAN, TYPE_DATE, TYPE_DATETIME
+from funcy import project, rpartial
+
 from redash.authentication.org_resolving import current_org
+from redash.query_runner import (
+    TYPE_BOOLEAN,
+    TYPE_DATE,
+    TYPE_DATETIME,
+    TYPE_FLOAT,
+    TYPE_INTEGER,
+    TYPE_STRING,
+)
+from redash.utils import UnicodeWriter, json_loads
 
 
 def _convert_format(fmt):
@@ -86,7 +97,9 @@ def serialize_query_result_to_dsv(query_result, delimiter):
 
     fieldnames, special_columns = _get_column_lists(query_data["columns"] or [])
 
-    writer = csv.DictWriter(s, extrasaction="ignore", fieldnames=fieldnames, delimiter=delimiter)
+    writer = csv.DictWriter(
+        s, extrasaction="ignore", fieldnames=fieldnames, delimiter=delimiter
+    )
     writer.writeheader()
 
     for row in query_data["rows"]:
@@ -121,3 +134,49 @@ def serialize_query_result_to_xlsx(query_result):
     book.close()
 
     return output.getvalue()
+
+
+def serialize_query_result_to_parquet(query_result):
+    output = io.BytesIO()
+    query_data = query_result.data
+    conversions = [
+        {"pandas_type": pyarrow.bool_, "redash_type": TYPE_BOOLEAN},
+        {
+            # "pyarrow_type": pyarrow.date64,
+            "pyarrow_type": pyarrow.string,
+            "redash_type": TYPE_DATE,
+            # "to_redash": lambda x: x.strftime("%Y-%m-%d %H:%M:%S"),
+            # "to_pyarrow": lambda x: x,
+        },
+        {
+            # "pyarrow_type": pyarrow.timestamp,
+            "pyarrow_type": pyarrow.string,
+            "redash_type": TYPE_DATETIME,
+            # "to_redash": lambda x: x.strftime("%Y-%m-%d %H:%M:%S"),
+            # "to_pyarrow": lambda x: x,
+        },
+        {"pyarrow_type": pyarrow.float64, "redash_type": TYPE_FLOAT},
+        {"pyarrow_type": pyarrow.int64, "redash_type": TYPE_INTEGER},
+        {"pyarrow_type": pyarrow.string, "redash_type": TYPE_STRING},
+    ]
+
+    fields = []
+
+    for column in query_data["columns"]:
+        for conversion in conversions:
+            if column["type"] == conversion["redash_type"]:
+                fields.append(pyarrow.field(column["name"], conversion["pyarrow_type"]))
+                break
+
+    table = pyarrow.Table.from_pylist(query_data["rows"])
+    print(table)
+    with pyarrow.parquet.ParquetWriter(
+        where=output,
+        schema=pyarrow.schema(
+            fields,
+            # metadata={"friendly_name": "id"},
+        ),
+    ) as writer:
+        writer.write_table(table)
+
+    return output.getvalue()

From 5a9074014d23872ac31ae68a8d5da1fb9563f3cf Mon Sep 17 00:00:00 2001
From: gabriel-suela
Date: Fri, 3 Mar 2023 13:51:57 -0300
Subject: [PATCH 2/6] fix: serialize query result to parquet

---
 .../QueryControlDropdown.jsx       |  2 +-
 redash/handlers/query_results.py   |  2 +-
 redash/serializers/query_result.py | 66 +++++++++++++------
 requirements_all_ds.txt            |  2 +-
 4 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
index 617482e8ab..f3e9bb70d9 100644
--- a/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
+++ b/client/app/components/EditVisualizationButton/QueryControlDropdown.jsx
@@ -74,7 +74,7 @@ export default function QueryControlDropdown(props) {
       queryResult={props.queryResult}
       embed={props.embed}
       apiKey={props.apiKey}>
-      Download as Parquet File
+      Download as Parquet File

diff --git a/redash/handlers/query_results.py b/redash/handlers/query_results.py
index 7f590ba934..570e533822 100644
--- a/redash/handlers/query_results.py
+++ b/redash/handlers/query_results.py
@@ -457,7 +457,7 @@ def make_excel_response(query_result):
     def make_parquet_response(query_result):
         headers = {
             # https://issues.apache.org/jira/browse/PARQUET-1889
-            "Content-Type": "application/parquet",
+            # "Content-Type": "application/parquet"
         }
         return make_response(
             serialize_query_result_to_parquet(query_result), 200, headers

diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py
index 3d9405c8be..e38bc9dd33 100644
--- a/redash/serializers/query_result.py
+++ b/redash/serializers/query_result.py
@@ -1,7 +1,9 @@
 import csv
 import io
+from typing import Optional
 
 import pyarrow
+import pyarrow.compute
 import pyarrow.parquet
 import xlsxwriter
 from dateutil.parser import isoparse as parse_date
@@ -18,6 +20,8 @@
 )
 from redash.utils import UnicodeWriter, json_loads
 
+logging.getLogger(__name__)
+
 
 def _convert_format(fmt):
@@ -139,43 +143,65 @@ def serialize_query_result_to_xlsx(query_result):
 def serialize_query_result_to_parquet(query_result):
     output = io.BytesIO()
     query_data = query_result.data
+
+    def redash_datetime_to_pyarrow_timestamp(
+        table: "pyarrow.Table",
+        field: "pyarrow.Field",
+        conversion: Optional[dict] = None,
+    ) -> "pyarrow.Table":
+        column_index: int = table.schema.get_field_index(field.name)
+        column_data = pyarrow.compute.strptime(
+            table.column(column_index),
+            format=conversion["redash_format"],
+            unit="s",
+        )
+        new_table = table.set_column(column_index, field.name, column_data)
+        return new_table
+
     conversions = [
-        {"pandas_type": pyarrow.bool_, "redash_type": TYPE_BOOLEAN},
+        {"pyarrow_type": pyarrow.bool_(), "redash_type": TYPE_BOOLEAN},
         {
-            # "pyarrow_type": pyarrow.date64,
-            "pyarrow_type": pyarrow.string,
+            "pyarrow_type": pyarrow.date32(),
             "redash_type": TYPE_DATE,
-            # "to_redash": lambda x: x.strftime("%Y-%m-%d %H:%M:%S"),
-            # "to_pyarrow": lambda x: x,
+            "redash_format": r"%Y-%m-%d",
+            "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp,
        },
        {
-            # "pyarrow_type": pyarrow.timestamp,
-            "pyarrow_type": pyarrow.string,
+            "pyarrow_type": pyarrow.timestamp("s"),
             "redash_type": TYPE_DATETIME,
+            "redash_format": r"%Y-%m-%d %H:%M:%S",
+            "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp,
         },
-        {"pyarrow_type": pyarrow.float64, "redash_type": TYPE_FLOAT},
-        {"pyarrow_type": pyarrow.int64, "redash_type": TYPE_INTEGER},
-        {"pyarrow_type": pyarrow.string, "redash_type": TYPE_STRING},
+        {"pyarrow_type": pyarrow.float64(), "redash_type": TYPE_FLOAT},
+        {"pyarrow_type": pyarrow.int64(), "redash_type": TYPE_INTEGER},
+        {"pyarrow_type": pyarrow.string(), "redash_type": TYPE_STRING},
     ]
+    table = pyarrow.Table.from_pylist(query_data["rows"])
 
     fields = []
 
     for column in query_data["columns"]:
         for conversion in conversions:
             if column["type"] == conversion["redash_type"]:
-                fields.append(pyarrow.field(column["name"], conversion["pyarrow_type"]))
+                field = pyarrow.field(
+                    name=column["name"],
+                    type=conversion["pyarrow_type"],
+                    metadata={"friendly_name": column["friendly_name"]},
+                )
+                fields.append(field)
+                converter = conversion.get("redash_to_pyarrow")
+                if converter:
+                    table = converter(
+                        table=table,
+                        field=field,
+                        conversion=conversion,
+                    )
                 break
-
-    table = pyarrow.Table.from_pylist(query_data["rows"])
-    print(table)
+    target_schema = pyarrow.schema(fields)
+    table = table.cast(target_schema=target_schema)
     with pyarrow.parquet.ParquetWriter(
         where=output,
-        schema=pyarrow.schema(
-            fields,
-            # metadata={"friendly_name": "id"},
-        ),
+        schema=target_schema,
     ) as writer:
         writer.write_table(table)

diff --git a/requirements_all_ds.txt b/requirements_all_ds.txt
index 3b1eb21a64..2c66031445 100644
--- a/requirements_all_ds.txt
+++ b/requirements_all_ds.txt
@@ -47,4 +47,4 @@ nzpy>=1.15
 nzalchemy
 python-arango==6.1.0
 pinotdb>=0.4.5
-pyarrow==10.0.0
\ No newline at end of file
+pyarrow==10.0.0

From 831d7e9f200d702f94ae4a24769cb368aeb61c88 Mon Sep 17 00:00:00 2001
From: Lucas Fernando Cardoso Nunes
Date: Mon, 27 Mar 2023 22:05:29 -0300
Subject: [PATCH 3/6] fix:

---
 redash/serializers/query_result.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py
index e38bc9dd33..dd766ad35b 100644
--- a/redash/serializers/query_result.py
+++ b/redash/serializers/query_result.py
@@ -20,8 +20,6 @@
 )
 from redash.utils import UnicodeWriter, json_loads
 
-logging.getLogger(__name__)
-
 
 def _convert_format(fmt):

From a907c6c13f26d78225b6e453fb44eed42050455f Mon Sep 17 00:00:00 2001
From: Lucas Fernando Cardoso Nunes
Date: Tue, 28 Mar 2023 11:42:50 -0300
Subject: [PATCH 4/6] fix: to parquet redash datetime format

---
 redash/serializers/query_result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py
index dd766ad35b..74f3e6ee6a 100644
--- a/redash/serializers/query_result.py
+++ b/redash/serializers/query_result.py
@@ -167,7 +167,7 @@ def redash_datetime_to_pyarrow_timestamp(
         {
             "pyarrow_type": pyarrow.timestamp("s"),
             "redash_type": TYPE_DATETIME,
-            "redash_format": r"%Y-%m-%d %H:%M:%S",
+            "redash_format": r"%Y-%m-%dT%H:%M:%S",
             "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp,
         },
         {"pyarrow_type": pyarrow.float64(), "redash_type": TYPE_FLOAT},

From 507e9e0d4633471ca4dec57f92370b15c7209bce Mon Sep 17 00:00:00 2001
From: gabriel-suela
Date: Thu, 29 Jun 2023 18:10:04 -0300
Subject: [PATCH 5/6] fix: datetime handle on export to parquet

---
 redash/serializers/query_result.py | 34 +++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py
index 74f3e6ee6a..6bdd26633e 100644
--- a/redash/serializers/query_result.py
+++ b/redash/serializers/query_result.py
@@ -137,7 +137,6 @@ def serialize_query_result_to_xlsx(query_result):
 
     return output.getvalue()
 
-
 def serialize_query_result_to_parquet(query_result):
     output = io.BytesIO()
     query_data = query_result.data
@@ -148,11 +147,20 @@ def redash_datetime_to_pyarrow_timestamp(
         conversion: Optional[dict] = None,
     ) -> "pyarrow.Table":
         column_index: int = table.schema.get_field_index(field.name)
-        column_data = pyarrow.compute.strptime(
-            table.column(column_index),
-            format=conversion["redash_format"],
-            unit="s",
-        )
+        column_data = table.column(column_index)
+
+        formats = conversion["redash_formats"]
+        for datetime_format in formats:
+            try:
+                column_data = pyarrow.compute.strptime(
+                    column_data,
+                    format=datetime_format,
+                    unit="s",
+                )
+                break
+            except pyarrow.lib.ArrowInvalid:
+                continue
+
         new_table = table.set_column(column_index, field.name, column_data)
         return new_table
 
@@ -161,13 +169,13 @@ def redash_datetime_to_pyarrow_timestamp(
         {
             "pyarrow_type": pyarrow.date32(),
             "redash_type": TYPE_DATE,
-            "redash_format": r"%Y-%m-%d",
+            "redash_formats": [r"%Y-%m-%d"],
             "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp,
         },
         {
             "pyarrow_type": pyarrow.timestamp("s"),
             "redash_type": TYPE_DATETIME,
-            "redash_format": r"%Y-%m-%dT%H:%M:%S",
+            "redash_formats": [r"%Y-%m-%dT%H:%M:%S", r"%Y-%m-%d %H:%M:%S"],
             "redash_to_pyarrow": redash_datetime_to_pyarrow_timestamp,
         },
         {"pyarrow_type": pyarrow.float64(), "redash_type": TYPE_FLOAT},
@@ -195,8 +203,10 @@ def redash_datetime_to_pyarrow_timestamp(
                         conversion=conversion,
                     )
                 break
+
     target_schema = pyarrow.schema(fields)
     table = table.cast(target_schema=target_schema)
+
     with pyarrow.parquet.ParquetWriter(
         where=output,
         schema=target_schema,
@@ -204,3 +214,11 @@ def redash_datetime_to_pyarrow_timestamp(
         writer.write_table(table)
 
     return output.getvalue()
+
+
+
+
+
+
+
+

From 46608f08fa782bbb99e3ba1209c42c7b045a5937 Mon Sep 17 00:00:00 2001
From: gabriel-suela
Date: Tue, 16 Apr 2024 15:27:38 -0300
Subject: [PATCH 6/6] style: misc

---
 .../dashboard-widget/VisualizationWidget.jsx | 18 ++++++++---------
 redash/handlers/query_results.py             | 20 ++++++++++---------
 redash/serializers/__init__.py               | 10 +++++++++-
 redash/serializers/query_result.py           | 11 ++--------
 4 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
index 9ea4f7d108..0807c4ef4a 100644
--- a/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
+++ b/client/app/components/dashboards/dashboard-widget/VisualizationWidget.jsx
@@ -58,15 +58,15 @@ function visualizationWidgetMenuOptions({ widget, canEditDashboard, onParameters
         "Download as Excel File"
       )}
     ,
-
-      {!isQueryResultEmpty ? (
-
-          Download as Parquet File
-
-      ) : (
-        "Download as Parquet File"
-      )}
-    ,
+
+    {!isQueryResultEmpty ? (
+
+        Download as Parquet File
+
+    ) : (
+      "Download as Parquet File"
+    )}
+  ,
     (canViewQuery || canEditParameters) && ,
     canViewQuery && (

diff --git a/redash/handlers/query_results.py b/redash/handlers/query_results.py
index 570e533822..9b426c8065 100644
--- a/redash/handlers/query_results.py
+++ b/redash/handlers/query_results.py
@@ -121,9 +121,11 @@ def run_query(
         current_user.id,
         current_user.is_api_user(),
         metadata={
-            "Username": repr(current_user)
-            if current_user.is_api_user()
-            else current_user.email,
+            "Username": (
+                repr(current_user)
+                if current_user.is_api_user()
+                else current_user.email
+            ),
             "query_id": query_id,
         },
     )
@@ -262,14 +264,14 @@ def options(self, query_id=None, query_result_id=None, filetype="json"):
             self.add_cors_headers(headers)
 
         if settings.ACCESS_CONTROL_REQUEST_METHOD:
-            headers[
-                "Access-Control-Request-Method"
-            ] = settings.ACCESS_CONTROL_REQUEST_METHOD
+            headers["Access-Control-Request-Method"] = (
+                settings.ACCESS_CONTROL_REQUEST_METHOD
+            )
 
         if settings.ACCESS_CONTROL_ALLOW_HEADERS:
-            headers[
-                "Access-Control-Allow-Headers"
-            ] = settings.ACCESS_CONTROL_ALLOW_HEADERS
+            headers["Access-Control-Allow-Headers"] = (
+                settings.ACCESS_CONTROL_ALLOW_HEADERS
+            )
 
         return make_response("", 200, headers)

diff --git a/redash/serializers/__init__.py b/redash/serializers/__init__.py
index 4c9b91a8d1..b2f1de5294 100644
--- a/redash/serializers/__init__.py
+++ b/redash/serializers/__init__.py
@@ -3,6 +3,7 @@
 classes we have. This will ensure cleaner code and better separation
 of concerns.
 """
+
 from funcy import project
 from flask_login import current_user
 
@@ -56,7 +57,14 @@ def public_widget(widget):
 def public_dashboard(dashboard):
     dashboard_dict = project(
         serialize_dashboard(dashboard, with_favorite_state=False),
-        ("name", "layout", "dashboard_filters_enabled", "updated_at", "created_at", "options"),
+        (
+            "name",
+            "layout",
+            "dashboard_filters_enabled",
+            "updated_at",
+            "created_at",
+            "options",
+        ),
     )
 
     widget_list = (

diff --git a/redash/serializers/query_result.py b/redash/serializers/query_result.py
index 6bdd26633e..6e087e9d6f 100644
--- a/redash/serializers/query_result.py
+++ b/redash/serializers/query_result.py
@@ -137,6 +137,7 @@ def serialize_query_result_to_xlsx(query_result):
 
     return output.getvalue()
 
+
 def serialize_query_result_to_parquet(query_result):
     output = io.BytesIO()
     query_data = query_result.data
@@ -148,7 +149,7 @@ def redash_datetime_to_pyarrow_timestamp(
     ) -> "pyarrow.Table":
         column_index: int = table.schema.get_field_index(field.name)
         column_data = table.column(column_index)
-
+
         formats = conversion["redash_formats"]
         for datetime_format in formats:
             try:
@@ -214,11 +215,3 @@ def redash_datetime_to_pyarrow_timestamp(
         writer.write_table(table)
 
     return output.getvalue()
-
-
-
-
-
-
-
-
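For reference, the following is a minimal, self-contained sketch (separate from the patches above) of the conversion approach the final version of serialize_query_result_to_parquet settles on: build a pyarrow Table from the row dicts, parse date/datetime strings with pyarrow.compute.strptime against a list of accepted formats, cast the table to a schema derived from the Redash column types, and write it through ParquetWriter into an in-memory buffer. The sample columns and rows are hypothetical; the only dependency assumed is the pyarrow package pinned in requirements_all_ds.txt.

import io

import pyarrow
import pyarrow.compute
import pyarrow.parquet

# Hypothetical query result in Redash's {"columns": [...], "rows": [...]} shape.
query_data = {
    "columns": [
        {"name": "id", "friendly_name": "id", "type": "integer"},
        {"name": "created_at", "friendly_name": "created_at", "type": "datetime"},
    ],
    "rows": [
        {"id": 1, "created_at": "2023-02-15T19:10:33"},
        {"id": 2, "created_at": "2023-03-03T13:51:57"},
    ],
}

pyarrow_types = {"integer": pyarrow.int64(), "datetime": pyarrow.timestamp("s")}
datetime_formats = [r"%Y-%m-%dT%H:%M:%S", r"%Y-%m-%d %H:%M:%S"]

# Rows arrive as plain dicts; date/datetime values are still strings here.
table = pyarrow.Table.from_pylist(query_data["rows"])

fields = []
for column in query_data["columns"]:
    fields.append(pyarrow.field(column["name"], pyarrow_types[column["type"]]))
    if column["type"] == "datetime":
        index = table.schema.get_field_index(column["name"])
        data = table.column(index)
        # Try each accepted format until one parses, as the patched serializer does.
        for fmt in datetime_formats:
            try:
                data = pyarrow.compute.strptime(data, format=fmt, unit="s")
                break
            except pyarrow.lib.ArrowInvalid:
                continue
        table = table.set_column(index, column["name"], data)

# Cast to the schema built from the column metadata, then write parquet bytes.
target_schema = pyarrow.schema(fields)
table = table.cast(target_schema=target_schema)

output = io.BytesIO()
with pyarrow.parquet.ParquetWriter(where=output, schema=target_schema) as writer:
    writer.write_table(table)

print(len(output.getvalue()), "bytes of parquet written")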