From 39cda68ab701b2cf9eee87991011e62736155c07 Mon Sep 17 00:00:00 2001 From: peacewong Date: Fri, 29 Dec 2023 16:56:34 +0800 Subject: [PATCH] remove feature doc --- docs/feature/_category_.json | 4 - docs/feature/base-engine-compatibilty.md | 74 -- docs/feature/datasource-generate-sql.md | 149 ---- .../feature/hive-engine-support-concurrent.md | 24 - docs/feature/other.md | 28 - docs/feature/overview.md | 33 - docs/feature/spark-etl.md | 642 ------------------ .../current/feature/_category_.json | 4 - .../feature/base-engine-compatibilty.md | 75 -- .../feature/datasource-generate-sql.md | 149 ---- .../feature/hive-engine-support-concurrent.md | 24 - .../current/feature/other.md | 29 - .../current/feature/overview.md | 33 - .../current/feature/spark-etl.md | 642 ------------------ .../version-1.5.0/feature/_category_.json | 4 - .../feature/base-engine-compatibilty.md | 75 -- .../feature/datasource-generate-sql.md | 149 ---- .../feature/hive-engine-support-concurrent.md | 24 - .../version-1.5.0/feature/other.md | 29 - .../version-1.5.0/feature/overview.md | 33 - .../version-1.5.0/feature/spark-etl.md | 642 ------------------ .../version-1.5.0/feature/_category_.json | 4 - .../feature/base-engine-compatibilty.md | 74 -- .../feature/datasource-generate-sql.md | 149 ---- .../feature/hive-engine-support-concurrent.md | 24 - versioned_docs/version-1.5.0/feature/other.md | 28 - .../version-1.5.0/feature/overview.md | 33 - .../version-1.5.0/feature/spark-etl.md | 642 ------------------ 28 files changed, 3820 deletions(-) delete mode 100644 docs/feature/_category_.json delete mode 100644 docs/feature/base-engine-compatibilty.md delete mode 100644 docs/feature/datasource-generate-sql.md delete mode 100644 docs/feature/hive-engine-support-concurrent.md delete mode 100644 docs/feature/other.md delete mode 100644 docs/feature/overview.md delete mode 100644 docs/feature/spark-etl.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/_category_.json delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/base-engine-compatibilty.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/datasource-generate-sql.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/hive-engine-support-concurrent.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/other.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/overview.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/spark-etl.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/_category_.json delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/base-engine-compatibilty.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/datasource-generate-sql.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/hive-engine-support-concurrent.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/other.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/overview.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/spark-etl.md delete mode 100644 versioned_docs/version-1.5.0/feature/_category_.json delete mode 100644 versioned_docs/version-1.5.0/feature/base-engine-compatibilty.md delete mode 100644 versioned_docs/version-1.5.0/feature/datasource-generate-sql.md delete mode 
100644 versioned_docs/version-1.5.0/feature/hive-engine-support-concurrent.md delete mode 100644 versioned_docs/version-1.5.0/feature/other.md delete mode 100644 versioned_docs/version-1.5.0/feature/overview.md delete mode 100644 versioned_docs/version-1.5.0/feature/spark-etl.md diff --git a/docs/feature/_category_.json b/docs/feature/_category_.json deleted file mode 100644 index eb7c770c8e5..00000000000 --- a/docs/feature/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "Version Feature", - "position": 1.5 -} \ No newline at end of file diff --git a/docs/feature/base-engine-compatibilty.md b/docs/feature/base-engine-compatibilty.md deleted file mode 100644 index 81062d30c43..00000000000 --- a/docs/feature/base-engine-compatibilty.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -title: Base Engine Dependency, Compatibility, Default Version Optimization -sidebar_position: 0.2 ---- - -## 1. Requirement background -1. The lower version of linkis needs to modify the code to adapt to different versions of Hive, Spark, etc. Because of compatibility issues, the compilation may fail, which can reduce the compatibility issues of these basic engines. -2. Hadoop, Hive, and Spark 3.x are very mature, and lower versions of the engine may have potential risks. Many users in the community use the 3.x version by default, so consider changing the default compiled version of Linkis to 3.x. - -## 2. Instructions for use - -## 2.1 Default version adjustment instructions - -Linkis 1.4.0 changes the default versions of Hadoop, Hive, and Spark to 3.x, and the specific versions are Hadoop 3.3.4, Hive 3.1.3, and Spark 3.2.1. - -## 2.2 Different version adaptation - -To compile different hive versions, we only need to specify `-D=xxx`, for example: -``` -mvn clean install package -Dhive.version=2.3.3 -``` -To compile different versions of spark, we only need to specify `-D=xxx`. Common usage scenarios are as follows: -``` -#spark3+hadoop3 -mvn install package - -#spark3+hadoop2 -mvn install package -Phadoop-2.7 - -#spark2+hadoop2 -mvn install package -Pspark-2.4 -Phadoop-2.7 - -#spark2+ hadoop3 -mvn install package -Pspark-2.4 -``` -## 3. Precautions -1. When the default version is compiled, the basic version is: hadoop3.3.4 + hive3.1.3 + spark3.2.1 -``` -mvn install package -``` -Due to the default version upgrade of the default base engine, `spark-3.2`, `hadoop-3.3` and `spark-2.4-hadoop-3.3` profiles were removed, and profiles `hadoop-2.7` and `spark-2.4` were added. - -2. The sub-version of spark can be specified by `-Dspark.version=xxx`. The default scala version used by the system is 2.12.17, which can be adapted to spark 3.x version. To compile spark 2.x, you need to use scala 2.11 version. Can be compiled with -Pspark-2.4 parameter, or -Dspark.version=2.xx -Dscala.version=2.11.12 -Dscala.binary.version=2.11. - -3. The subversion of hadoop can be specified by `-Dhadoop.version=xxx` - -for example : -``` -mvn install package -Pspark-3.2 -Phadoop-3.3 -Dspark.version=3.1.3 -``` - -4. Version 2.x of hive needs to rely on jersey. Hive EC does not add jersey dependency when compiling by default. You can compile it through the following guidelines. - -**Compile hive version 2.3.3** - -When compiling hive EC, the profile that activates adding jersey dependencies when specifying version 2.3.3 is added by default. 
Users can compile by specifying the -Dhive.version=2.3.3 parameter - -**Compile other hive 2.x versions** - -Modify the linkis-engineconn-plugins/hive/pom.xml file, modify 2.3.3 to the user-compiled version, such as 2.1.0 -```xml - - hive-jersey-dependencies - - - hive.version - - 2.1.0 - - - ... - -``` -Add -Dhive.version=2.1.0 parameter when compiling. diff --git a/docs/feature/datasource-generate-sql.md b/docs/feature/datasource-generate-sql.md deleted file mode 100644 index e9b0ec5a341..00000000000 --- a/docs/feature/datasource-generate-sql.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -title: Generate SQL according to the data source -sidebar_position: 0.5 ---- - -## 1. Background -Generate SparkSQL and JdbcSQL based on data source information, including DDL, DML, and DQL. - -## 2. Instructions for use -### generate SparkSQL - -Interface address: /api/rest_j/v1/metadataQuery/getSparkSql - -Request method: GET - -Request data type: application/x-www-form-urlencoded - -Request parameters: - -| Parameter name | Description | Required | Data type | -|-------------------------------|-------|-----|--| -| `dataSourceName` | data source name | is | String | -| `system` | system name | is | String | -| `database` | database name | is | String | -| `table` | table name | is | String | - -Example response: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "sparkSql": { - "ddl": "CREATE TEMPORARY TABLE test USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://localhost:3306/test', dbtable 'test', user 'root', password 'password' )", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` -Currently supports jdbc, kafka, elasticsearch, mongo data source, you can register spark table according to SparkSQLDDL for query - -### Generate JdbcSQL - -Interface address: /api/rest_j/v1/metadataQuery/getJdbcSql - -Request method: GET - -Request data type: application/x-www-form-urlencoded - -Request parameters: - -| Parameter name | Description | Required | Data type | -|-------------------------------|-------|-----|--| -| `dataSourceName` | data source name | is | String | -| `system` | system name | is | String | -| `database` | database name | is | String | -| `table` | table name | is | String | - -Example response: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "jdbcSql": { - "ddl": "CREATE TABLE `test` (\n\t `id` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'The column name is id',\n\t `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'The column name is name',\n\t PRIMARY KEY (`id`)\n\t) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` - -Currently supports JDBC data sources, such as: mysql, oracle, postgres, etc. JdbcSQLDDL can be used for front-end display. - -## 3. Precautions -1. You need to register the data source first - -## 4. 
Implementation principle
-### Generate SparkSQL implementation principle
-Define DDL_SQL_TEMPLATE and fill it in with the data source information
-```java
-  public static final String JDBC_DDL_SQL_TEMPLATE =
-      "CREATE TEMPORARY TABLE %s "
-          + "USING org.apache.spark.sql.jdbc "
-          + "OPTIONS ("
-          + " url '%s',"
-          + " dbtable '%s',"
-          + " user '%s',"
-          + " password '%s'"
-          + ")";
-```
-
-### Generate JdbcSQL implementation principle
-Concatenate the DDL from the table schema information
-```java
-public String generateJdbcDdlSql(String database, String table) {
-    StringBuilder ddl = new StringBuilder();
-    ddl.append("CREATE TABLE ").append(String.format("%s.%s", database, table)).append(" (");
-
-    try {
-      List<MetaColumnInfo> columns = getColumns(database, table);
-      if (CollectionUtils.isNotEmpty(columns)) {
-        for (MetaColumnInfo column : columns) {
-          ddl.append("\n\t").append(column.getName()).append(" ").append(column.getType());
-          if (column.getLength() > 0) {
-            ddl.append("(").append(column.getLength()).append(")");
-          }
-          if (!column.isNullable()) {
-            ddl.append(" NOT NULL");
-          }
-          ddl.append(",");
-        }
-        String primaryKeys =
-            columns.stream()
-                .filter(MetaColumnInfo::isPrimaryKey)
-                .map(MetaColumnInfo::getName)
-                .collect(Collectors.joining(", "));
-        if (StringUtils.isNotBlank(primaryKeys)) {
-          ddl.append(String.format("\n\tPRIMARY KEY (%s),", primaryKeys));
-        }
-        ddl.deleteCharAt(ddl.length() - 1);
-      }
-    } catch (Exception e) {
-      LOG.warn("Fail to get Sql columns (failed to get the field list)");
-    }
-
-    ddl.append("\n)");
-
-    return ddl.toString();
-}
-```
-
-Some data sources support retrieving the DDL directly
-
-**mysql**
-```sql
-SHOW CREATE TABLE `table`
-```
-
-**oracle**
-```sql
-SELECT DBMS_METADATA.GET_DDL('TABLE', 'table', 'database') AS DDL FROM DUAL
-```
\ No newline at end of file
diff --git a/docs/feature/hive-engine-support-concurrent.md b/docs/feature/hive-engine-support-concurrent.md
deleted file mode 100644
index fc19a66f916..00000000000
--- a/docs/feature/hive-engine-support-concurrent.md
+++ /dev/null
@@ -1,24 +0,0 @@
----
-title: Hive engine supports concurrency and reuse
-sidebar_position: 0.3
----
-
-## 1. Requirement background
-hiveEngineConn supports concurrent execution, which reduces the resource consumption of starting Hive engines and improves the engine reuse rate.
-
-## 2. Instructions for use
-First, modify the linkis-engineconn.properties file in the linkis-engineconn-plugins/hive/src/main/resources directory,
-and set linkis.hive.engineconn.concurrent.support to true.
-```
-# Support parallel execution
-wds.linkis.engineconn.support.parallelism=true
-
-# Concurrency limit, the default is 10
-linkis.hive.engineconn.concurrent.limit=10
-```
-
-Submit a Hive job, and when the first job is complete, submit another one. You can see that the Hive engine has been reused.
-
-After modifying the configuration, restart the cg-linkismanager service, or make it take effect through the [Engine Refresh API](../api/http/linkis-cg-engineplugin-api/engineconn-plugin-refresh.md).
-## 3. Precautions
-1. Wait for the first Hive task to finish successfully before submitting the second one. Submitting multiple tasks at the same time on first use may start multiple ECs because no EC is available yet.
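A minimal way to observe the reuse from the command line is sketched below, assuming a Hive 3.1.3 engine is deployed and that the `hive-3.1.3` engine type label and the `hql` code type are accepted in your environment (adjust both to match your installation):

```shell
# Submit the first Hive task and wait for it to finish successfully (see the precaution above)
sh ./bin/linkis-cli -engineType hive-3.1.3 -codeType hql -code "select 1" -submitUser hadoop -proxyUser hadoop

# Submit a second task afterwards; it should run on the already started Hive EC instead of starting a new one
sh ./bin/linkis-cli -engineType hive-3.1.3 -codeType hql -code "select 2" -submitUser hadoop -proxyUser hadoop
```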
\ No newline at end of file
diff --git a/docs/feature/other.md b/docs/feature/other.md
deleted file mode 100644
index aeb42806873..00000000000
--- a/docs/feature/other.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-title: Description of other features
-sidebar_position: 0.6
----
-
-## 1. Do not kill ECs when the ECM restarts
-When the ECM restarts, it can choose not to kill its engines and instead take over the existing surviving engines, which makes the Engine Connection Manager (ECM) service stateless.
-
-## 2. Remove the json4s dependency
-Different Spark versions depend on different json4s versions, which makes supporting multiple Spark versions difficult, so the json4s dependency has been removed from Linkis.
-For example, Spark 2.4 needs json4s v3.5.3 while Spark 3.2 needs json4s v3.7.0-M11.
-
-## 3. Engine version definitions moved out of the EngineConn modules
-The engine version is defined inside each `EngineConn` module by default, so a version change has to be made in many places. The version definitions have therefore been moved to the top-level pom file. To compile a specific engine module, run the build from the project root directory and use `-pl` to select the module, for example:
-```
-mvn install package -pl linkis-engineconn-plugins/spark -Dspark.version=3.2.1
-```
-The engine version can be specified with the -D parameter of the mvn build, such as -Dspark.version=xxx or -Dpresto.version=0.235.
-All underlying engine versions have now been moved to the top-level pom file; compiling a specific engine module always happens from the project root directory with `-pl` selecting the module (see the additional sketch at the end of this page).
-
-## 4. Linkis main version number policy
-
-After version 1.3.2, Linkis is no longer upgraded by patch version. The next version is 1.4.0, followed by 1.5.0, 1.6.0, and so on. When a released version has a major defect that must be fixed, a patch version such as 1.4.1 will be released for it.
-
-
-## 5. Linkis main branch commit policy
-
-For Linkis 1.3.2 and earlier, code changes were merged into the dev branch by default. The Apache Linkis development community is in fact very active: new features and fixes are continuously submitted to the dev branch, but the master branch is what users see by default when they visit the Linkis repository. Since a new version is only released every quarter, the community looks less active than it is when viewed from the master branch. Therefore, starting from version 1.4.0, code submitted by developers is merged into the master branch by default.
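Building on the `-pl` example in section 3 above, the following is a hedged sketch of compiling only the Hive engine module with a pinned engine version; the module path is the one referenced elsewhere in these docs, and the version value is only an illustration that should match your cluster:

```shell
# Build only the Hive EngineConn plugin from the project root, overriding the engine version defined in the top-level pom
mvn install package -pl linkis-engineconn-plugins/hive -Dhive.version=3.1.3
```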
diff --git a/docs/feature/overview.md b/docs/feature/overview.md
deleted file mode 100644
index 556ea5b0faf..00000000000
--- a/docs/feature/overview.md
+++ /dev/null
@@ -1,33 +0,0 @@
----
-title: Version Overview
-sidebar_position: 0.1
----
-
-- [Base engine dependencies, compatibility, default version optimization](./base-engine-compatibilty.md)
-- [Hive engine connector supports concurrent tasks](./hive-engine-support-concurrent.md)
-- [Support more data sources](../user-guide/datasource-manual#31-jdbc-datasource)
-- [Spark ETL enhancements](./spark-etl.md)
-- [Generate SQL from a data source](./datasource-generate-sql.md)
-- [linkis-storage supports the S3 file system (experimental)](../deployment/deploy-quick#343-s3-mode)
-- [Add PostgreSQL database support (experimental)](../deployment/deploy-quick#22-configuration-database-information)
-- [Add Impala engine support (experimental)](../engine-usage/impala.md)
-- [Other feature descriptions](./other.md)
-- [Release Notes](/download/release-notes-1.5.0)
-
-## Parameter changes
-
-| Module (service name) | Type | Parameter name | Default value | Description |
-| --------- | -------- | ----------------------------------------------------- | --------------------- | -------------------------------------------------------------- |
-| mg-eureka | Added    | eureka.instance.metadata-map.linkis.app.version        | ${linkis.app.version} | Eureka metadata reports the Linkis application version          |
-| mg-eureka | Added    | eureka.instance.metadata-map.linkis.conf.version       | None                  | Eureka metadata reports the Linkis service version              |
-| mg-eureka | Modified | eureka.client.registry-fetch-interval-seconds          | 8                     | Interval (seconds) at which the Eureka client pulls service registry information |
-| mg-eureka | Added    | eureka.instance.lease-renewal-interval-in-seconds      | 4                     | Frequency (seconds) at which the Eureka client sends heartbeats to the server |
-| mg-eureka | Added    | eureka.instance.lease-expiration-duration-in-seconds   | 12                    | Timeout (seconds) for which Eureka waits for the next heartbeat |
-| EC-shell  | Modified | wds.linkis.engineconn.support.parallelism              | true                  | Whether to enable parallel execution of shell tasks             |
-| EC-shell  | Modified | linkis.engineconn.shell.concurrent.limit               | 15                    | Concurrency limit for shell tasks                               |
-| Entrance  | Modified | linkis.entrance.auto.clean.dirty.data.enable           | true                  | Whether to clean dirty data at startup                          |
-
-
-
-## Database table changes
-For details, see the upgrade schema `db/upgrade/1.5.0_schema` file in the corresponding branch of the code repository (https://github.com/apache/linkis)
\ No newline at end of file
diff --git a/docs/feature/spark-etl.md b/docs/feature/spark-etl.md
deleted file mode 100644
index 2151cb74cf9..00000000000
--- a/docs/feature/spark-etl.md
+++ /dev/null
@@ -1,642 +0,0 @@
----
-title: Support Spark ETL data synchronization
-sidebar_position: 0.4
----
-
-## 1. Background
-With the Spark ETL feature, users can synchronize data through Spark by writing a JSON configuration.
-
-## 2. Supported types
-
-Currently supported types
-```text
-jdbc, file, redis, kafka, elasticsearch, mongo, datalake (hudi, delta)
-```
-
-## 3. 
General configuration instructions -```text -name: data source name -type: Contains `source`, `transformation`, `sink`, corresponding to input, transformation, and output respectively -options: configuration parameters -saveMode: save mode, currently supports: `overwrite` and `append` -path: file path, can be: 'file://' or 'hdfs://'(default) -`resultTable` needs to correspond to `sourceTable` -``` - -## 4. Instructions for use - -### 4.1 Add the required jar package -When using the data source, you need to upload the corresponding spark connector jar to the spark/jars directory, the directory location is $SPARK_HOME/jars - -The spark connector jar can be obtained by the following command - -```text -git clone https://github.com/apache/linkis.git - -cd link is - -git checkout master - -cd linkis-engineconn-plugins/spark/scala-2.12 - -mvn clean install -Dmaven.test.skip=true -``` - -The compiled spark connector jar is located in the following directory -```text -linkis/linkis-engineconn-plugins/spark/scala-2.12/target/out/spark/dist/3.2.1/lib -``` - -### 4.2 linkis-cli submit task example - -Just pass in the specific json code in code, pay attention to the conversion of quotation marks. - -```shell -sh /appcom/Install/linkis/bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "" -submitUser hadoop -proxyUser hadoop -``` - -Linkis-cli submits redis data synchronization task example -```shell -sh ./bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "{\"plugins\":[{\"name\":\"file\",\"type\":\" source\",\"config\":{\"resultTable\":\"test\",\"path\":\"hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin\",\ "serializer\":\"csv\",\"options\":{\"header\":\"true\",\"delimiter\":\";\"},\"columnNames\":[ \"name\",\"age\"]}},{\"name\":\"redis\",\"type\":\"sink\",\"config\":{\"sourceTable \":\"test\",\"host\":\"wds07\",\"port\":\"6679\",\"auth\":\"password\",\"targetTable\" :\"spark_etl_test\",\"saveMode\":\"append\"}}]}" -submitUser hadoop -proxyUser hadoop -``` -### 4.3 Synchronization json script description of each data source - -#### 4.3.1 jdbc - -Configuration instructions -```text -url: jdbc connection information -user: user name -password: password -query: sql query statement -``` - -json code - -```json -{ - "sources": [ - { - "name": "jdbc", - "type": "source", - "config": { - "resultTable": "test1", - "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8", - "driver": "com.mysql.jdbc.Driver", - "user": "root", - "password": "123456", - "query": "select * from dip_linkis.linkis_ps_udf_baseinfo", - "options": { - } - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T1654611700631", - "sql": "select * from test1" - } - } - ], - "sinks": [ - { - "name": "jdbc", - "type": "sink", - "config": { - "sourceTable": "T1654611700631", - "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8", - "driver": "com.mysql.jdbc.Driver", - "user": "root", - "password": "123456", - "targetTable": "linkis_ps_udf_baseinfo2", - "options": { - } - } - } - ] -} -``` - -A new jar needs to be added, and the corresponding jar should be selected according to the specific data source used -```text -DmJdbcDriver18.jar -kingbase8-8.6.0.jar -postgresql-42.3.8.jar -``` - -#### 4.3.2 file - -Configuration instructions - -```text -serializer: file format, can be `csv`, `parquet`, etc. 
-columnNames: column names -``` - - -json code - -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test2", - "path": "hdfs:///tmp/test_new_no_partition", - "serializer": "csv", - "columnNames": ["id", "create_user", "udf_name", "udf_type", "tree_id", "create_time", "update_time", "sys", "cluster_name", "is_expire", "is_shared"] - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "test2", - "path": "hdfs:///tmp/test_new", - "partitionBy": ["create_user"], - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -spark-excel-2.12.17-3.2.2_2.12-3.2.2_0.18.1.jar -``` - -#### 4.3.3 redis - -```text -sourceTable: source table, -host: ip address, -port": port, -auth": password, -targetTable: target table, -saveMode: support append -``` - -json code -```json -{ - "plugins":[ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test", - "path": "hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - }, - { - "name": "redis", - "type": "sink", - "config": { - "sourceTable": "test", - "host": "wds07", - "port": "6679", - "auth": "password", - "targetTable": "spark_etl_test", - "saveMode": "append" - } - } - ] -} -``` - -Need to add new jar -```text -jedis-3.2.0.jar -commons-pool2-2.8.1.jar -spark-redis_2.12-2.6.0.jar -``` - -#### 4.3.4 kafka - -Configuration instructions -```text -servers: kafka connection information -mode: currently supports `batch` and `stream` -topic: kafka topic name -``` - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "batch", - "topic": "test121212" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "kafka", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "servers": "localhost:9092", - "topic": "test121212" - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "stream", - "topic": "test55555" - } - } - ] -} -``` - -Need to add new jar -``` -kafka-clients-2.8.0.jar -spark-sql-kafka-0-10_2.12-3.2.1.jar -spark-token-provider-kafka-0-10_2.12-3.2.1.jar -``` - -#### 4.3.5 elasticsearch - -Configuration instructions -```text -node: elasticsearch ip -port: elasticsearch port -index: elasticsearch index name -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "elasticsearch", - "config": { - "sourceTable": "T1654611700631", - "node": "localhost", - "port": "9200", - "index": "estest", - "saveMode": "overwrite" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "elasticsearch", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "node": 
"localhost", - "port": "9200", - "index": "estest" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -elasticsearch-spark-30_2.12-7.17.7.jar -``` - -#### 4.3.6 mongo - -Configuration instructions -```text -uri: mongo connection information -database: mongo database -collection: mongo collection -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "mongo", - "config": { - "sourceTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test", - "saveMode": "overwrite" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "mongo", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/json", - "saveMode": "overwrite", - "serializer": "json" - } - } - ] -} -``` - -Need to add new jar -``` -bson-3.12.8.jar -mongo-spark-connector_2.12-3.0.1.jar -mongodb-driver-core-3.12.8.jar -mongodb-driver-sync-3.12.8.jar -``` - -#### 4.3.7 delta - -Configuration instructions -```text -tableFormat: currently supports `hudi` and `delta` -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - "saveMode": "overwrite" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header": "true" - }, - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -delta-core_2.12-2.0.2.jar -delta-storage-2.0.2.jar -``` - -#### 4.3.8 hudi - -Configuration instructions -```text -tableFormat: currently supports `hudi` and `delta` -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "hudi", - "options": { - "hoodie.table.name": "huditest", - 
"hoodie.datasource.write.recordkey.field": "age", - "hoodie.datasource.write.precombine.field":"age" - }, - "path": "file://{filePath}/hudi", - "saveMode": "append" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "hudi", - "path": "file://{filePath}/hudi", - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header": "true" - }, - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -hudi-spark3.2-bundle_2.12-0.13.0.jar -``` \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/_category_.json b/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/_category_.json deleted file mode 100644 index c862e6a30c2..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "版本特性", - "position": 1.5 -} \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/base-engine-compatibilty.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/base-engine-compatibilty.md deleted file mode 100644 index 83229cbe6b4..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/base-engine-compatibilty.md +++ /dev/null @@ -1,75 +0,0 @@ ---- -title: 基础引擎依赖性、兼容性、默认版本优化 -sidebar_position: 0.2 ---- - -## 1. 需求背景 -1. 低版本 linkis 需要通过修改代码来适配不同的 Hive、Spark 等版本,因为兼容性问题,编译可能会失败,可以减少这些基础引擎的兼容性问题。 -2. Hadoop、Hive、Spark 3.x 已经很成熟,并且低版本的引擎可能有潜在的风险点,社区很多用户默认使用 3.x 版本,因此考虑将 Linkis 默认编译的版本修改为 3.x 。 - -## 2. 使用说明 - -## 2.1 默认版本调整说明 - -Linkis 1.4.0 将 Hadoop、Hive、Spark 默认版本修改为 3.x,具体版本分别为 Hadoop 3.3.4、Hive 3.1.3、Spark 3.2.1 。 - -## 2.2 不同版本适配 - -不同的hive版本的编译,我们只需要指定`-D=xxx`就可以了,比如: -``` -mvn clean install package -Dhive.version=2.3.3 -``` -不同版本的spark编译,我们也只需要指定`-D=xxx`就可以了,常用的使用场景如下: -``` -#spark3+hadoop3 -mvn install package - -#spark3+hadoop2 -mvn install package -Phadoop-2.7 - -#spark2+hadoop2 -mvn install package -Pspark-2.4 -Phadoop-2.7 - -#spark2+ hadoop3 -mvn install package -Pspark-2.4 -``` -## 3. 注意事项 -1. 默认版本编译时,基础版本为:hadoop3.3.4 + hive3.1.3 + spark3.2.1 -``` -mvn install package -``` -由于默认基础引擎的默认版本升级,`spark-3.2`、`hadoop-3.3`和`spark-2.4-hadoop-3.3` profile被移除,新增profile `hadoop-2.7` and `spark-2.4`。 - -2. spark的子版本可以通过`-Dspark.version=xxx` 来指定,系统默认使用的 scala 版本为 2.12.17,可适配 spark 3.x 版本 。如需编译 spark 2.x,需要使用 scala 2.11 版本。可通过 -Pspark-2.4 参数,或者 -Dspark.version=2.xx -Dscala.version=2.11.12 -Dscala.binary.version=2.11 编译。 - -3. hadoop的子版本可以通过`-Dhadoop.version=xxx` 来指定 - -举个例子 : -``` -mvn install package -Pspark-3.2 -Phadoop-3.3 -Dspark.version=3.1.3 -``` - -4. hive 2.x 版本需要依赖 jersey,hive EC 默认编译时未添加 jersey依赖,可通过如下指引编译。 - -**编译 hive 2.3.3 版本** - -编译 hive EC 时默认添加了指定 2.3.3 版本时激活添加 jersey 依赖的 profile,用户可通过指定 -Dhive.version=2.3.3 参数编译 - -**编译其它 hive 2.x 版本** - -修改 linkis-engineconn-plugins/hive/pom.xml 文件,将 2.3.3 修改为用户编译版本,如 2.1.0 -```xml - - hive-jersey-dependencies - - - hive.version - - 2.1.0 - - - ... 
- -``` -编译时添加 -Dhive.version=2.1.0 参数。 - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/datasource-generate-sql.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/datasource-generate-sql.md deleted file mode 100644 index 1750237fdee..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/datasource-generate-sql.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -title: 根据数据源生成SQL -sidebar_position: 0.5 ---- - -## 1. 背景 -根据数据源信息生成 SparkSQL 和 JdbcSQL,包含DDL、DML、DQL。 - -## 2. 使用说明 -### 生成SparkSQL - -接口地址:/api/rest_j/v1/metadataQuery/getSparkSql - -请求方式:GET - -请求数据类型:application/x-www-form-urlencoded - -请求参数: - -| 参数名 | 说明 | 是否必须 | 数据类型 | -|------------------------------|-------|-----|--| -| `dataSourceName` | 数据源名称 | 是 | String | -| `system` | 系统名称 | 是 | String | -| `database` | 数据库名称 | 是 | String | -| `table` | 表名称 | 是 | String | - -响应示例: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "sparkSql": { - "ddl": "CREATE TEMPORARY TABLE test USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://localhost:3306/test', dbtable 'test', user 'root', password 'password')", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` -目前支持jdbc、kafka、elasticsearch、mongo 数据源,可以根据SparkSQLDDL注册 spark table 进行查询 - -### 生成JdbcSQL - -接口地址:/api/rest_j/v1/metadataQuery/getJdbcSql - -请求方式:GET - -请求数据类型:application/x-www-form-urlencoded - -请求参数: - -| 参数名 | 说明 | 是否必须 | 数据类型 | -|------------------------------|-------|-----|--| -| `dataSourceName` | 数据源名称 | 是 | String | -| `system` | 系统名称 | 是 | String | -| `database` | 数据库名称 | 是 | String | -| `table` | 表名称 | 是 | String | - -响应示例: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "jdbcSql": { - "ddl": "CREATE TABLE `test` (\n\t `id` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '列名是id',\n\t `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '列名是name',\n\t PRIMARY KEY (`id`)\n\t) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` - -目前支持 JDBC 数据源,如:mysql、oracle、postgres等,JdbcSQLDDL可以用于前端展示。 - -## 3. 注意事项 -1. 需要先注册数据源 - -## 4. 
实现原理 -### 生成SparkSQL实现原理 -定义DDL_SQL_TEMPLATE,获取数据源信息进行替换 -```java - public static final String JDBC_DDL_SQL_TEMPLATE = - "CREATE TEMPORARY TABLE %s " - + "USING org.apache.spark.sql.jdbc " - + "OPTIONS (" - + " url '%s'," - + " dbtable '%s'," - + " user '%s'," - + " password '%s'" - + ")"; -``` - -### 生成JdbcSQL实现原理 -根据表schema信息拼接DDL -```java -public String generateJdbcDdlSql(String database, String table) { - StringBuilder ddl = new StringBuilder(); - ddl.append("CREATE TABLE ").append(String.format("%s.%s", database, table)).append(" ("); - - try { - List < MetaColumnInfo > columns = getColumns(database, table); - if (CollectionUtils.isNotEmpty(columns)) { - for (MetaColumnInfo column: columns) { - ddl.append("\n\t").append(column.getName()).append(" ").append(column.getType()); - if (column.getLength() > 0) { - ddl.append("(").append(column.getLength()).append(")"); - } - if (!column.isNullable()) { - ddl.append(" NOT NULL"); - } - ddl.append(","); - } - String primaryKeys = - columns.stream() - .filter(MetaColumnInfo::isPrimaryKey) - .map(MetaColumnInfo::getName) - .collect(Collectors.joining(", ")); - if (StringUtils.isNotBlank(primaryKeys)) { - ddl.append(String.format("\n\tPRIMARY KEY (%s),", primaryKeys)); - } - ddl.deleteCharAt(ddl.length() - 1); - } - } catch (Exception e) { - LOG.warn("Fail to get Sql columns(获取字段列表失败)"); - } - - ddl.append("\n)"); - - return ddl.toString(); -} -``` - -部分数据源支持直接获取DDL - -**mysql** -```sql -SHOW CREATE TABLE 'table' -``` - -**oracle** -```sql -SELECT DBMS_METADATA.GET_DDL('TABLE', 'table', 'database') AS DDL FROM DUAL -``` \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/hive-engine-support-concurrent.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/hive-engine-support-concurrent.md deleted file mode 100644 index a07ebba71c9..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/hive-engine-support-concurrent.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: hive engine支持并发,支持复用 -sidebar_position: 0.3 ---- - -## 1. 需求背景 -hiveEngineConn支持并发,减少启动hive引擎的资源消耗,提高引擎复用率。 - -## 2. 使用说明 -首先,在linkis-engineconn-plugins/hive/src/main/resources目录下修改linkis-engineconn.properties文件, -并将linkis.hive.engineconn.concurrent.support设置为true。 -``` -# 支持并行执行 -wds.linkis.engineconn.support.parallelism=true - -# 并发数限制,默认为 10 -linkis.hive.engineconn.concurrent.limit=10 -``` - -提交一个hive任务,当第一个任务完成后,再提交另一个任务。您可以看到hive引擎已被重用。 - -配置修改后重启 cg-linkismanager 服务,或通过 [引擎刷新接口](../api/http/linkis-cg-engineplugin-api/engineconn-plugin-refresh.md) 使配置生效。 -## 3. 注意事项 -1、等待第一个hive任务执行成功后,再提交第二个hive任务。初次同时提交多个任务可能由于暂无可用的 EC 导致启动多个 EC。 \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/other.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/other.md deleted file mode 100644 index db4ddc1c284..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/other.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: 其它特性说明 -sidebar_position: 0.6 ---- - -## 1. ECM 重启时不 kill EC -当ECM重新启动时,可以选择不杀死引擎,而是可以接管现有的存活引擎。使引擎连接管理器 (ECM) 服务无状态。 - -## 2. 移除 json4s 依赖 -spark 不同版本依赖不同的json4s版本,不利于spark多版本的支持,我们需要减少这个json4s依赖,从linkis中移除了json4s. -比如: spark2.4 需要json4s v3.5.3, spark3.2需要json4s v3.7.0-M11。 - -## 3. 
EngineConn模块定义依赖引擎版本 -引擎的版本定义默认在 `EngineConn`中,一旦相关版本变更,需要修改多处,我们可以把相关的版本定义统一放到顶层pom文件中。编译指定引擎模块时,需要在项目根目录编译,并使用`-pl`来编译具体的引擎模块,比如: -``` -mvn install package -pl linkis-engineconn-plugins/spark -Dspark.version=3.2.1 -``` -引擎的版本可以通过mvn编译-D参数来指定,比如 -Dspark.version=xxx 、 -Dpresto.version=0.235 -目前所有的底层引擎版本新都已经移到顶层pom文件中,编译指定引擎模块时,需要在项目根目录编译,并使用`-pl`来编译具体的引擎模块。 - -## 4. Linkis 主版本号修改说明 - -Linkis 从 1.3.2 版本后将不再按小版本升级,下一个版本为 1.4.0,再往后升级时版本号为1.5.0,1.6.0 以此类推。当遇到某个发布版本有重大缺陷需要修复时会拉取小版本修复缺陷,如 1.4.1 。 - - -## 5. LInkis 代码提交主分支说明 - -Linkis 1.3.2 及之前版本修改代码默认是合并到 dev 分支。实际上 Apache Linkis 的开发社区很活跃,对于新开发的需求或修复功能都会提交到 dev 分支,但是用户访问 Linkis 代码库的时候默认显示的是 master 分支。由于我们一个季度才会发布一个新版本,从 master 分支来看显得社区活跃的不高。因此我们决定从 1.4.0 版本开始,将开发者提交的代码默认合并到 master 分支。 - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/overview.md deleted file mode 100644 index e90e4503f65..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/overview.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: 版本总览 -sidebar_position: 0.1 ---- - -- [基础引擎依赖性、兼容性、默认版本优化](./base-engine-compatibilty.md) -- [Hive 引擎连接器支持并发任务](./hive-engine-support-concurrent.md) -- [支持更多的数据源](../user-guide/datasource-manual#31-jdbc-数据源) -- [Spark ETL 功能增强](./spark-etl.md) -- [根据数据源生成SQL](./datasource-generate-sql.md) -- [linkis-storage 支持 S3 文件系统(实验版本)](../deployment/deploy-quick#343-s3-模式) -- [增加 postgresql 数据库支持(实验版本)](../deployment/deploy-quick#22-配置数据库信息) -- [增加 impala 引擎支持(实验版本)](../engine-usage/impala.md) -- [其它特性说明](./other.md) -- [版本的 Release-Notes](/download/release-notes-1.5.0) - -## 参数变化 - -| 模块名(服务名)| 类型 | 参数名 | 默认值 | 描述 | -| ----------- | ----- | -------------------------------------------------------- | ---------------- | ------------------------------------------------------- | -| mg-eureka | 新增 | eureka.instance.metadata-map.linkis.app.version | ${linkis.app.version} | Eureka元数据上报Linkis应用版本信息| -| mg-eureka | 新增 | eureka.instance.metadata-map.linkis.conf.version | 无 | Eureka元数据上报Linkis服务版本信息 | -| mg-eureka | 修改 | eureka.client.registry-fetch-interval-seconds | 8 | Eureka Client拉取服务注册信息间隔时间(秒) | -| mg-eureka | 新增 | eureka.instance.lease-renewal-interval-in-seconds | 4 | eureka client发送心跳给server端的频率(秒)| -| mg-eureka | 新增 | eureka.instance.lease-expiration-duration-in-seconds | 12 | eureka 等待下一次心跳的超时时间(秒)| -| EC-shell | 修改 | wds.linkis.engineconn.support.parallelism | true | 是否开启 shell 任务并行执行| -| EC-shell | 修改 | linkis.engineconn.shell.concurrent.limit | 15 | shell 任务并发数 | -| Entrance | 修改 | linkis.entrance.auto.clean.dirty.data.enable | true | 启动时是否清理脏数据 | - - - -## 数据库表变化 -详细见代码仓库(https://github.com/apache/linkis) 对应分支中的升级schema`db/upgrade/1.5.0_schema`文件 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/spark-etl.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/spark-etl.md deleted file mode 100644 index 6f486f4515a..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/feature/spark-etl.md +++ /dev/null @@ -1,642 +0,0 @@ ---- -title: 支持 spark ETL 数据同步 -sidebar_position: 0.4 ---- - -## 1. 背景 -使用 Spark ETL 功能,用户可以通过配置 json 的方式进行 Spark 数据同步。 - -## 2. 支持的类型 - -目前支持的类型 -```text -jdbc、file、redis、kafka、elasticsearch、mongo、datalake(hudi、delta) -``` - -## 3. 通用配置说明 -```text -name: 数据源名称 -type: 包含`source`、`transformation`、`sink`,分别对应输入、转换、输出 -options: 配置参数 -saveMode: 保存模式,目前支持:`overwrite`和`append` -path: 文件路径,可以是: 'file://' or 'hdfs://'(default) -`resultTable`需要和`sourceTable`对应 -``` - -## 4. 
使用说明 - -### 4.1 添加所需的 jar 包 -使用数据源时需要将对应的 spark connector jar 上传至 spark/jars目录,目录位置 $SPARK_HOME/jars - -spark connector jar 可以通过以下命令获取 - -```text -git clone https://github.com/apache/linkis.git - -cd linkis - -git checkout master - -cd linkis-engineconn-plugins/spark/scala-2.12 - -mvn clean install -Dmaven.test.skip=true -``` - -编译完成的spark connector jar位于以下目录中 -```text -linkis/linkis-engineconn-plugins/spark/scala-2.12/target/out/spark/dist/3.2.1/lib -``` - -### 4.2 linkis-cli 提交任务示例 - -在 code 传入具体的 json 代码即可,注意引号格式转换。 - -```shell -sh /appcom/Install/linkis/bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "" -submitUser hadoop -proxyUser hadoop -``` - -linkis-cli 提交 redis 数据同步任务示例 -```shell -sh ./bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "{\"plugins\":[{\"name\":\"file\",\"type\":\"source\",\"config\":{\"resultTable\":\"test\",\"path\":\"hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin\",\"serializer\":\"csv\",\"options\":{\"header\":\"true\",\"delimiter\":\";\"},\"columnNames\":[\"name\",\"age\"]}},{\"name\":\"redis\",\"type\":\"sink\",\"config\":{\"sourceTable\":\"test\",\"host\":\"wds07\",\"port\":\"6679\",\"auth\":\"password\",\"targetTable\":\"spark_etl_test\",\"saveMode\":\"append\"}}]}" -submitUser hadoop -proxyUser hadoop -``` -### 4.3 各数据源同步 json 脚本说明 - -#### 4.3.1 jdbc - -配置说明 -```text -url: jdbc连接信息 -user: 用户名称 -password: 密码 -query: sql查询语句 -``` - -json code - -```json -{ - "sources": [ - { - "name": "jdbc", - "type": "source", - "config": { - "resultTable": "test1", - "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8", - "driver": "com.mysql.jdbc.Driver", - "user": "root", - "password": "123456", - "query": "select * from dip_linkis.linkis_ps_udf_baseinfo", - "options": { - } - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T1654611700631", - "sql": "select * from test1" - } - } - ], - "sinks": [ - { - "name": "jdbc", - "type": "sink", - "config": { - "sourceTable": "T1654611700631", - "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8", - "driver": "com.mysql.jdbc.Driver", - "user": "root", - "password": "123456", - "targetTable": "linkis_ps_udf_baseinfo2", - "options": { - } - } - } - ] -} -``` - -需要新增的jar,根据具体使用的数据源选择对应的 jar -```text -DmJdbcDriver18.jar -kingbase8-8.6.0.jar -postgresql-42.3.8.jar -``` - -#### 4.3.2 file - -配置说明 - -```text -serializer: 文件格式,可以是`csv`、`parquet`等 -columnNames: 列名 -``` - - -json code - -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test2", - "path": "hdfs:///tmp/test_new_no_partition", - "serializer": "csv", - "columnNames": ["id", "create_user", "udf_name", "udf_type", "tree_id", "create_time", "update_time", "sys", "cluster_name", "is_expire", "is_shared"] - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "test2", - "path": "hdfs:///tmp/test_new", - "partitionBy": ["create_user"], - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -需要新增的 jar -``` -spark-excel-2.12.17-3.2.2_2.12-3.2.2_0.18.1.jar -``` - -#### 4.3.3 redis - -```text -sourceTable: 源表, -host: ip地址, -port": 端口, -auth": 密码, -targetTable: 目标表, -saveMode: 支持 append -``` - -json code -```json -{ - "plugins":[ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test", - "path": "hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - 
"delimiter":";" - }, - "columnNames": ["name", "age"] - } - }, - { - "name": "redis", - "type":"sink", - "config": { - "sourceTable": "test", - "host": "wds07", - "port": "6679", - "auth":"password", - "targetTable":"spark_etl_test", - "saveMode": "append" - } - } - ] -} -``` - -需要新增的jar -```text -jedis-3.2.0.jar -commons-pool2-2.8.1.jar -spark-redis_2.12-2.6.0.jar -``` - -#### 4.3.4 kafka - -配置说明 -```text -servers: kafka连接信息 -mode: 目前支持`batch`和`stream` -topic: kafka topic名称 -``` - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "batch", - "topic": "test121212" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "kafka", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "servers": "localhost:9092", - "topic": "test121212" - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "stream", - "topic": "test55555" - } - } - ] -} -``` - -需要新增的 jar -``` -kafka-clients-2.8.0.jar -spark-sql-kafka-0-10_2.12-3.2.1.jar -spark-token-provider-kafka-0-10_2.12-3.2.1.jar -``` - -#### 4.3.5 elasticsearch - -配置说明 -```text -node: elasticsearch ip -port: elasticsearch port -index: elasticsearch索引名称 -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "elasticsearch", - "config": { - "sourceTable": "T1654611700631", - "node": "localhost", - "port": "9200", - "index": "estest", - "saveMode": "overwrite" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "elasticsearch", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "node": "localhost", - "port": "9200", - "index": "estest" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -需要新增的jar -``` -elasticsearch-spark-30_2.12-7.17.7.jar -``` - -#### 4.3.6 mongo - -配置说明 -```text -uri: mongo连接信息 -database: mongo database -collection: mongo collection -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "mongo", - "config": { - "sourceTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test", - "saveMode": "overwrite" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "mongo", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": 
"T1654611700631", - "path": "file://{filePath}/json", - "saveMode": "overwrite", - "serializer": "json" - } - } - ] -} -``` - -需要新增的 jar -``` -bson-3.12.8.jar -mongo-spark-connector_2.12-3.0.1.jar -mongodb-driver-core-3.12.8.jar -mongodb-driver-sync-3.12.8.jar -``` - -#### 4.3.7 delta - -配置说明 -```text -tableFormat: 目前支持`hudi`和`delta` -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - "saveMode": "overwrite" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header":"true" - }, - "serializer": "csv" - } - } - ] -} -``` - -需要新增的 jar -``` -delta-core_2.12-2.0.2.jar -delta-storage-2.0.2.jar -``` - -#### 4.3.8 hudi - -配置说明 -```text -tableFormat: 目前支持`hudi`和`delta` -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "hudi", - "options": { - "hoodie.table.name":"huditest", - "hoodie.datasource.write.recordkey.field":"age", - "hoodie.datasource.write.precombine.field":"age" - }, - "path": "file://{filePath}/hudi", - "saveMode": "append" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "hudi", - "path": "file://{filePath}/hudi", - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header":"true" - }, - "serializer": "csv" - } - } - ] -} -``` - -需要新增的 jar -``` -hudi-spark3.2-bundle_2.12-0.13.0.jar -``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/_category_.json b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/_category_.json deleted file mode 100644 index c862e6a30c2..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "版本特性", - "position": 1.5 -} \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/base-engine-compatibilty.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/base-engine-compatibilty.md deleted 
file mode 100644 index 83229cbe6b4..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/base-engine-compatibilty.md +++ /dev/null @@ -1,75 +0,0 @@ ---- -title: 基础引擎依赖性、兼容性、默认版本优化 -sidebar_position: 0.2 ---- - -## 1. 需求背景 -1. 低版本 linkis 需要通过修改代码来适配不同的 Hive、Spark 等版本,因为兼容性问题,编译可能会失败,可以减少这些基础引擎的兼容性问题。 -2. Hadoop、Hive、Spark 3.x 已经很成熟,并且低版本的引擎可能有潜在的风险点,社区很多用户默认使用 3.x 版本,因此考虑将 Linkis 默认编译的版本修改为 3.x 。 - -## 2. 使用说明 - -## 2.1 默认版本调整说明 - -Linkis 1.4.0 将 Hadoop、Hive、Spark 默认版本修改为 3.x,具体版本分别为 Hadoop 3.3.4、Hive 3.1.3、Spark 3.2.1 。 - -## 2.2 不同版本适配 - -不同的hive版本的编译,我们只需要指定`-D=xxx`就可以了,比如: -``` -mvn clean install package -Dhive.version=2.3.3 -``` -不同版本的spark编译,我们也只需要指定`-D=xxx`就可以了,常用的使用场景如下: -``` -#spark3+hadoop3 -mvn install package - -#spark3+hadoop2 -mvn install package -Phadoop-2.7 - -#spark2+hadoop2 -mvn install package -Pspark-2.4 -Phadoop-2.7 - -#spark2+ hadoop3 -mvn install package -Pspark-2.4 -``` -## 3. 注意事项 -1. 默认版本编译时,基础版本为:hadoop3.3.4 + hive3.1.3 + spark3.2.1 -``` -mvn install package -``` -由于默认基础引擎的默认版本升级,`spark-3.2`、`hadoop-3.3`和`spark-2.4-hadoop-3.3` profile被移除,新增profile `hadoop-2.7` and `spark-2.4`。 - -2. spark的子版本可以通过`-Dspark.version=xxx` 来指定,系统默认使用的 scala 版本为 2.12.17,可适配 spark 3.x 版本 。如需编译 spark 2.x,需要使用 scala 2.11 版本。可通过 -Pspark-2.4 参数,或者 -Dspark.version=2.xx -Dscala.version=2.11.12 -Dscala.binary.version=2.11 编译。 - -3. hadoop的子版本可以通过`-Dhadoop.version=xxx` 来指定 - -举个例子 : -``` -mvn install package -Pspark-3.2 -Phadoop-3.3 -Dspark.version=3.1.3 -``` - -4. hive 2.x 版本需要依赖 jersey,hive EC 默认编译时未添加 jersey依赖,可通过如下指引编译。 - -**编译 hive 2.3.3 版本** - -编译 hive EC 时默认添加了指定 2.3.3 版本时激活添加 jersey 依赖的 profile,用户可通过指定 -Dhive.version=2.3.3 参数编译 - -**编译其它 hive 2.x 版本** - -修改 linkis-engineconn-plugins/hive/pom.xml 文件,将 2.3.3 修改为用户编译版本,如 2.1.0 -```xml - - hive-jersey-dependencies - - - hive.version - - 2.1.0 - - - ... - -``` -编译时添加 -Dhive.version=2.1.0 参数。 - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/datasource-generate-sql.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/datasource-generate-sql.md deleted file mode 100644 index 1750237fdee..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/datasource-generate-sql.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -title: 根据数据源生成SQL -sidebar_position: 0.5 ---- - -## 1. 背景 -根据数据源信息生成 SparkSQL 和 JdbcSQL,包含DDL、DML、DQL。 - -## 2. 
使用说明 -### 生成SparkSQL - -接口地址:/api/rest_j/v1/metadataQuery/getSparkSql - -请求方式:GET - -请求数据类型:application/x-www-form-urlencoded - -请求参数: - -| 参数名 | 说明 | 是否必须 | 数据类型 | -|------------------------------|-------|-----|--| -| `dataSourceName` | 数据源名称 | 是 | String | -| `system` | 系统名称 | 是 | String | -| `database` | 数据库名称 | 是 | String | -| `table` | 表名称 | 是 | String | - -响应示例: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "sparkSql": { - "ddl": "CREATE TEMPORARY TABLE test USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://localhost:3306/test', dbtable 'test', user 'root', password 'password')", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` -目前支持jdbc、kafka、elasticsearch、mongo 数据源,可以根据SparkSQLDDL注册 spark table 进行查询 - -### 生成JdbcSQL - -接口地址:/api/rest_j/v1/metadataQuery/getJdbcSql - -请求方式:GET - -请求数据类型:application/x-www-form-urlencoded - -请求参数: - -| 参数名 | 说明 | 是否必须 | 数据类型 | -|------------------------------|-------|-----|--| -| `dataSourceName` | 数据源名称 | 是 | String | -| `system` | 系统名称 | 是 | String | -| `database` | 数据库名称 | 是 | String | -| `table` | 表名称 | 是 | String | - -响应示例: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "jdbcSql": { - "ddl": "CREATE TABLE `test` (\n\t `id` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '列名是id',\n\t `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT '列名是name',\n\t PRIMARY KEY (`id`)\n\t) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` - -目前支持 JDBC 数据源,如:mysql、oracle、postgres等,JdbcSQLDDL可以用于前端展示。 - -## 3. 注意事项 -1. 需要先注册数据源 - -## 4. 
实现原理 -### 生成SparkSQL实现原理 -定义DDL_SQL_TEMPLATE,获取数据源信息进行替换 -```java - public static final String JDBC_DDL_SQL_TEMPLATE = - "CREATE TEMPORARY TABLE %s " - + "USING org.apache.spark.sql.jdbc " - + "OPTIONS (" - + " url '%s'," - + " dbtable '%s'," - + " user '%s'," - + " password '%s'" - + ")"; -``` - -### 生成JdbcSQL实现原理 -根据表schema信息拼接DDL -```java -public String generateJdbcDdlSql(String database, String table) { - StringBuilder ddl = new StringBuilder(); - ddl.append("CREATE TABLE ").append(String.format("%s.%s", database, table)).append(" ("); - - try { - List < MetaColumnInfo > columns = getColumns(database, table); - if (CollectionUtils.isNotEmpty(columns)) { - for (MetaColumnInfo column: columns) { - ddl.append("\n\t").append(column.getName()).append(" ").append(column.getType()); - if (column.getLength() > 0) { - ddl.append("(").append(column.getLength()).append(")"); - } - if (!column.isNullable()) { - ddl.append(" NOT NULL"); - } - ddl.append(","); - } - String primaryKeys = - columns.stream() - .filter(MetaColumnInfo::isPrimaryKey) - .map(MetaColumnInfo::getName) - .collect(Collectors.joining(", ")); - if (StringUtils.isNotBlank(primaryKeys)) { - ddl.append(String.format("\n\tPRIMARY KEY (%s),", primaryKeys)); - } - ddl.deleteCharAt(ddl.length() - 1); - } - } catch (Exception e) { - LOG.warn("Fail to get Sql columns(获取字段列表失败)"); - } - - ddl.append("\n)"); - - return ddl.toString(); -} -``` - -部分数据源支持直接获取DDL - -**mysql** -```sql -SHOW CREATE TABLE 'table' -``` - -**oracle** -```sql -SELECT DBMS_METADATA.GET_DDL('TABLE', 'table', 'database') AS DDL FROM DUAL -``` \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/hive-engine-support-concurrent.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/hive-engine-support-concurrent.md deleted file mode 100644 index a07ebba71c9..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/hive-engine-support-concurrent.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: hive engine支持并发,支持复用 -sidebar_position: 0.3 ---- - -## 1. 需求背景 -hiveEngineConn支持并发,减少启动hive引擎的资源消耗,提高引擎复用率。 - -## 2. 使用说明 -首先,在linkis-engineconn-plugins/hive/src/main/resources目录下修改linkis-engineconn.properties文件, -并将linkis.hive.engineconn.concurrent.support设置为true。 -``` -# 支持并行执行 -wds.linkis.engineconn.support.parallelism=true - -# 并发数限制,默认为 10 -linkis.hive.engineconn.concurrent.limit=10 -``` - -提交一个hive任务,当第一个任务完成后,再提交另一个任务。您可以看到hive引擎已被重用。 - -配置修改后重启 cg-linkismanager 服务,或通过 [引擎刷新接口](../api/http/linkis-cg-engineplugin-api/engineconn-plugin-refresh.md) 使配置生效。 -## 3. 注意事项 -1、等待第一个hive任务执行成功后,再提交第二个hive任务。初次同时提交多个任务可能由于暂无可用的 EC 导致启动多个 EC。 \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/other.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/other.md deleted file mode 100644 index db4ddc1c284..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/other.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: 其它特性说明 -sidebar_position: 0.6 ---- - -## 1. ECM 重启时不 kill EC -当ECM重新启动时,可以选择不杀死引擎,而是可以接管现有的存活引擎。使引擎连接管理器 (ECM) 服务无状态。 - -## 2. 移除 json4s 依赖 -spark 不同版本依赖不同的json4s版本,不利于spark多版本的支持,我们需要减少这个json4s依赖,从linkis中移除了json4s. -比如: spark2.4 需要json4s v3.5.3, spark3.2需要json4s v3.7.0-M11。 - -## 3. 
EngineConn模块定义依赖引擎版本 -引擎的版本定义默认在 `EngineConn`中,一旦相关版本变更,需要修改多处,我们可以把相关的版本定义统一放到顶层pom文件中。编译指定引擎模块时,需要在项目根目录编译,并使用`-pl`来编译具体的引擎模块,比如: -``` -mvn install package -pl linkis-engineconn-plugins/spark -Dspark.version=3.2.1 -``` -引擎的版本可以通过mvn编译-D参数来指定,比如 -Dspark.version=xxx 、 -Dpresto.version=0.235 -目前所有的底层引擎版本新都已经移到顶层pom文件中,编译指定引擎模块时,需要在项目根目录编译,并使用`-pl`来编译具体的引擎模块。 - -## 4. Linkis 主版本号修改说明 - -Linkis 从 1.3.2 版本后将不再按小版本升级,下一个版本为 1.4.0,再往后升级时版本号为1.5.0,1.6.0 以此类推。当遇到某个发布版本有重大缺陷需要修复时会拉取小版本修复缺陷,如 1.4.1 。 - - -## 5. LInkis 代码提交主分支说明 - -Linkis 1.3.2 及之前版本修改代码默认是合并到 dev 分支。实际上 Apache Linkis 的开发社区很活跃,对于新开发的需求或修复功能都会提交到 dev 分支,但是用户访问 Linkis 代码库的时候默认显示的是 master 分支。由于我们一个季度才会发布一个新版本,从 master 分支来看显得社区活跃的不高。因此我们决定从 1.4.0 版本开始,将开发者提交的代码默认合并到 master 分支。 - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/overview.md deleted file mode 100644 index 2755345ad75..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/overview.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: 版本总览 -sidebar_position: 0.1 ---- - -- [基础引擎依赖性、兼容性、默认版本优化](./base-engine-compatibilty.md) -- [Hive 引擎连接器支持并发任务](./hive-engine-support-concurrent.md) -- [支持更多的数据源](../user-guide/datasource-manual#31-jdbc-数据源) -- [Spark ETL 功能增强](./spark-etl.md) -- [根据数据源生成SQL](./datasource-generate-sql.md) -- [linkis-storage 支持 S3 文件系统(实验版本)](../deployment/deploy-quick#343-s3-模式) -- [增加 postgresql 数据库支持(实验版本)](../deployment/deploy-quick#22-配置数据库信息) -- [增加 impala 引擎支持(实验版本)](../engine-usage/impala.md) -- [其它特性说明](./other.md) -- [版本的 Release-Notes](/download/release-notes-1.4.0) - -## 参数变化 - -| 模块名(服务名)| 类型 | 参数名 | 默认值 | 描述 | -| ----------- | ----- | -------------------------------------------------------- | ---------------- | ------------------------------------------------------- | -| mg-eureka | 新增 | eureka.instance.metadata-map.linkis.app.version | ${linkis.app.version} | Eureka元数据上报Linkis应用版本信息| -| mg-eureka | 新增 | eureka.instance.metadata-map.linkis.conf.version | 无 | Eureka元数据上报Linkis服务版本信息 | -| mg-eureka | 修改 | eureka.client.registry-fetch-interval-seconds | 8 | Eureka Client拉取服务注册信息间隔时间(秒) | -| mg-eureka | 新增 | eureka.instance.lease-renewal-interval-in-seconds | 4 | eureka client发送心跳给server端的频率(秒)| -| mg-eureka | 新增 | eureka.instance.lease-expiration-duration-in-seconds | 12 | eureka 等待下一次心跳的超时时间(秒)| -| EC-shell | 修改 | wds.linkis.engineconn.support.parallelism | true | 是否开启 shell 任务并行执行| -| EC-shell | 修改 | linkis.engineconn.shell.concurrent.limit | 15 | shell 任务并发数 | -| Entrance | 修改 | linkis.entrance.auto.clean.dirty.data.enable | true | 启动时是否清理脏数据 | - - - -## 数据库表变化 -详细见代码仓库(https://github.com/apache/linkis) 对应分支中的升级schema`db/upgrade/1.4.0_schema`文件 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/spark-etl.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/spark-etl.md deleted file mode 100644 index 6f486f4515a..00000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.5.0/feature/spark-etl.md +++ /dev/null @@ -1,642 +0,0 @@ ---- -title: 支持 spark ETL 数据同步 -sidebar_position: 0.4 ---- - -## 1. 背景 -使用 Spark ETL 功能,用户可以通过配置 json 的方式进行 Spark 数据同步。 - -## 2. 支持的类型 - -目前支持的类型 -```text -jdbc、file、redis、kafka、elasticsearch、mongo、datalake(hudi、delta) -``` - -## 3. 
通用配置说明 -```text -name: 数据源名称 -type: 包含`source`、`transformation`、`sink`,分别对应输入、转换、输出 -options: 配置参数 -saveMode: 保存模式,目前支持:`overwrite`和`append` -path: 文件路径,可以是: 'file://' or 'hdfs://'(default) -`resultTable`需要和`sourceTable`对应 -``` - -## 4. 使用说明 - -### 4.1 添加所需的 jar 包 -使用数据源时需要将对应的 spark connector jar 上传至 spark/jars目录,目录位置 $SPARK_HOME/jars - -spark connector jar 可以通过以下命令获取 - -```text -git clone https://github.com/apache/linkis.git - -cd linkis - -git checkout master - -cd linkis-engineconn-plugins/spark/scala-2.12 - -mvn clean install -Dmaven.test.skip=true -``` - -编译完成的spark connector jar位于以下目录中 -```text -linkis/linkis-engineconn-plugins/spark/scala-2.12/target/out/spark/dist/3.2.1/lib -``` - -### 4.2 linkis-cli 提交任务示例 - -在 code 传入具体的 json 代码即可,注意引号格式转换。 - -```shell -sh /appcom/Install/linkis/bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "" -submitUser hadoop -proxyUser hadoop -``` - -linkis-cli 提交 redis 数据同步任务示例 -```shell -sh ./bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "{\"plugins\":[{\"name\":\"file\",\"type\":\"source\",\"config\":{\"resultTable\":\"test\",\"path\":\"hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin\",\"serializer\":\"csv\",\"options\":{\"header\":\"true\",\"delimiter\":\";\"},\"columnNames\":[\"name\",\"age\"]}},{\"name\":\"redis\",\"type\":\"sink\",\"config\":{\"sourceTable\":\"test\",\"host\":\"wds07\",\"port\":\"6679\",\"auth\":\"password\",\"targetTable\":\"spark_etl_test\",\"saveMode\":\"append\"}}]}" -submitUser hadoop -proxyUser hadoop -``` -### 4.3 各数据源同步 json 脚本说明 - -#### 4.3.1 jdbc - -配置说明 -```text -url: jdbc连接信息 -user: 用户名称 -password: 密码 -query: sql查询语句 -``` - -json code - -```json -{ - "sources": [ - { - "name": "jdbc", - "type": "source", - "config": { - "resultTable": "test1", - "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8", - "driver": "com.mysql.jdbc.Driver", - "user": "root", - "password": "123456", - "query": "select * from dip_linkis.linkis_ps_udf_baseinfo", - "options": { - } - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T1654611700631", - "sql": "select * from test1" - } - } - ], - "sinks": [ - { - "name": "jdbc", - "type": "sink", - "config": { - "sourceTable": "T1654611700631", - "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8", - "driver": "com.mysql.jdbc.Driver", - "user": "root", - "password": "123456", - "targetTable": "linkis_ps_udf_baseinfo2", - "options": { - } - } - } - ] -} -``` - -需要新增的jar,根据具体使用的数据源选择对应的 jar -```text -DmJdbcDriver18.jar -kingbase8-8.6.0.jar -postgresql-42.3.8.jar -``` - -#### 4.3.2 file - -配置说明 - -```text -serializer: 文件格式,可以是`csv`、`parquet`等 -columnNames: 列名 -``` - - -json code - -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test2", - "path": "hdfs:///tmp/test_new_no_partition", - "serializer": "csv", - "columnNames": ["id", "create_user", "udf_name", "udf_type", "tree_id", "create_time", "update_time", "sys", "cluster_name", "is_expire", "is_shared"] - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "test2", - "path": "hdfs:///tmp/test_new", - "partitionBy": ["create_user"], - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -需要新增的 jar -``` -spark-excel-2.12.17-3.2.2_2.12-3.2.2_0.18.1.jar -``` - -#### 4.3.3 redis - -```text -sourceTable: 源表, -host: ip地址, -port": 端口, -auth": 密码, -targetTable: 目标表, -saveMode: 支持 append -``` - -json code -```json -{ - 
"plugins":[ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test", - "path": "hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - }, - { - "name": "redis", - "type":"sink", - "config": { - "sourceTable": "test", - "host": "wds07", - "port": "6679", - "auth":"password", - "targetTable":"spark_etl_test", - "saveMode": "append" - } - } - ] -} -``` - -需要新增的jar -```text -jedis-3.2.0.jar -commons-pool2-2.8.1.jar -spark-redis_2.12-2.6.0.jar -``` - -#### 4.3.4 kafka - -配置说明 -```text -servers: kafka连接信息 -mode: 目前支持`batch`和`stream` -topic: kafka topic名称 -``` - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "batch", - "topic": "test121212" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "kafka", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "servers": "localhost:9092", - "topic": "test121212" - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "stream", - "topic": "test55555" - } - } - ] -} -``` - -需要新增的 jar -``` -kafka-clients-2.8.0.jar -spark-sql-kafka-0-10_2.12-3.2.1.jar -spark-token-provider-kafka-0-10_2.12-3.2.1.jar -``` - -#### 4.3.5 elasticsearch - -配置说明 -```text -node: elasticsearch ip -port: elasticsearch port -index: elasticsearch索引名称 -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "elasticsearch", - "config": { - "sourceTable": "T1654611700631", - "node": "localhost", - "port": "9200", - "index": "estest", - "saveMode": "overwrite" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "elasticsearch", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "node": "localhost", - "port": "9200", - "index": "estest" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -需要新增的jar -``` -elasticsearch-spark-30_2.12-7.17.7.jar -``` - -#### 4.3.6 mongo - -配置说明 -```text -uri: mongo连接信息 -database: mongo database -collection: mongo collection -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "mongo", - "config": { - "sourceTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test", - "saveMode": "overwrite" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "mongo", - "type": "source", 
- "config": { - "resultTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/json", - "saveMode": "overwrite", - "serializer": "json" - } - } - ] -} -``` - -需要新增的 jar -``` -bson-3.12.8.jar -mongo-spark-connector_2.12-3.0.1.jar -mongodb-driver-core-3.12.8.jar -mongodb-driver-sync-3.12.8.jar -``` - -#### 4.3.7 delta - -配置说明 -```text -tableFormat: 目前支持`hudi`和`delta` -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - "saveMode": "overwrite" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header":"true" - }, - "serializer": "csv" - } - } - ] -} -``` - -需要新增的 jar -``` -delta-core_2.12-2.0.2.jar -delta-storage-2.0.2.jar -``` - -#### 4.3.8 hudi - -配置说明 -```text -tableFormat: 目前支持`hudi`和`delta` -``` - - -数据写入 json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header":"true", - "delimiter":";" - }, - "columnNames": ["name", "age"] - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "hudi", - "options": { - "hoodie.table.name":"huditest", - "hoodie.datasource.write.recordkey.field":"age", - "hoodie.datasource.write.precombine.field":"age" - }, - "path": "file://{filePath}/hudi", - "saveMode": "append" - } - } - ] -} -``` - -数据读取 json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "hudi", - "path": "file://{filePath}/hudi", - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header":"true" - }, - "serializer": "csv" - } - } - ] -} -``` - -需要新增的 jar -``` -hudi-spark3.2-bundle_2.12-0.13.0.jar -``` diff --git a/versioned_docs/version-1.5.0/feature/_category_.json b/versioned_docs/version-1.5.0/feature/_category_.json deleted file mode 100644 index eb7c770c8e5..00000000000 --- a/versioned_docs/version-1.5.0/feature/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "Version Feature", - "position": 1.5 -} \ No newline at end of file diff --git 
a/versioned_docs/version-1.5.0/feature/base-engine-compatibilty.md b/versioned_docs/version-1.5.0/feature/base-engine-compatibilty.md deleted file mode 100644 index 81062d30c43..00000000000 --- a/versioned_docs/version-1.5.0/feature/base-engine-compatibilty.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -title: Base Engine Dependency, Compatibility, Default Version Optimization -sidebar_position: 0.2 ---- - -## 1. Requirement background -1. The lower version of linkis needs to modify the code to adapt to different versions of Hive, Spark, etc. Because of compatibility issues, the compilation may fail, which can reduce the compatibility issues of these basic engines. -2. Hadoop, Hive, and Spark 3.x are very mature, and lower versions of the engine may have potential risks. Many users in the community use the 3.x version by default, so consider changing the default compiled version of Linkis to 3.x. - -## 2. Instructions for use - -## 2.1 Default version adjustment instructions - -Linkis 1.4.0 changes the default versions of Hadoop, Hive, and Spark to 3.x, and the specific versions are Hadoop 3.3.4, Hive 3.1.3, and Spark 3.2.1. - -## 2.2 Different version adaptation - -To compile different hive versions, we only need to specify `-D=xxx`, for example: -``` -mvn clean install package -Dhive.version=2.3.3 -``` -To compile different versions of spark, we only need to specify `-D=xxx`. Common usage scenarios are as follows: -``` -#spark3+hadoop3 -mvn install package - -#spark3+hadoop2 -mvn install package -Phadoop-2.7 - -#spark2+hadoop2 -mvn install package -Pspark-2.4 -Phadoop-2.7 - -#spark2+ hadoop3 -mvn install package -Pspark-2.4 -``` -## 3. Precautions -1. When the default version is compiled, the basic version is: hadoop3.3.4 + hive3.1.3 + spark3.2.1 -``` -mvn install package -``` -Due to the default version upgrade of the default base engine, `spark-3.2`, `hadoop-3.3` and `spark-2.4-hadoop-3.3` profiles were removed, and profiles `hadoop-2.7` and `spark-2.4` were added. - -2. The sub-version of spark can be specified by `-Dspark.version=xxx`. The default scala version used by the system is 2.12.17, which can be adapted to spark 3.x version. To compile spark 2.x, you need to use scala 2.11 version. Can be compiled with -Pspark-2.4 parameter, or -Dspark.version=2.xx -Dscala.version=2.11.12 -Dscala.binary.version=2.11. - -3. The subversion of hadoop can be specified by `-Dhadoop.version=xxx` - -for example : -``` -mvn install package -Pspark-3.2 -Phadoop-3.3 -Dspark.version=3.1.3 -``` - -4. Version 2.x of hive needs to rely on jersey. Hive EC does not add jersey dependency when compiling by default. You can compile it through the following guidelines. - -**Compile hive version 2.3.3** - -When compiling hive EC, the profile that activates adding jersey dependencies when specifying version 2.3.3 is added by default. Users can compile by specifying the -Dhive.version=2.3.3 parameter - -**Compile other hive 2.x versions** - -Modify the linkis-engineconn-plugins/hive/pom.xml file, modify 2.3.3 to the user-compiled version, such as 2.1.0 -```xml - - hive-jersey-dependencies - - - hive.version - - 2.1.0 - - - ... - -``` -Add -Dhive.version=2.1.0 parameter when compiling. 
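
A rough sketch of putting the flags above together (assuming a local checkout of the Linkis source tree; the module path and version numbers are examples, not prescriptions). `help:active-profiles` is the standard Maven goal for checking which profiles a given `-D` value activates:

```shell
# Sketch: verify and build the hive EC against a specific hive version.
# Run from the root of a Linkis source checkout (paths/versions are examples).

# Show which Maven profiles -Dhive.version=2.3.3 activates
mvn help:active-profiles -pl linkis-engineconn-plugins/hive -Dhive.version=2.3.3

# Build only the hive EC module (plus its upstream modules) for hive 2.1.0,
# after editing linkis-engineconn-plugins/hive/pom.xml as described above
mvn clean install -pl linkis-engineconn-plugins/hive -am -Dhive.version=2.1.0 -Dmaven.test.skip=true
```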
diff --git a/versioned_docs/version-1.5.0/feature/datasource-generate-sql.md b/versioned_docs/version-1.5.0/feature/datasource-generate-sql.md deleted file mode 100644 index e9b0ec5a341..00000000000 --- a/versioned_docs/version-1.5.0/feature/datasource-generate-sql.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -title: Generate SQL according to the data source -sidebar_position: 0.5 ---- - -## 1. Background -Generate SparkSQL and JdbcSQL based on data source information, including DDL, DML, and DQL. - -## 2. Instructions for use -### generate SparkSQL - -Interface address: /api/rest_j/v1/metadataQuery/getSparkSql - -Request method: GET - -Request data type: application/x-www-form-urlencoded - -Request parameters: - -| Parameter name | Description | Required | Data type | -|-------------------------------|-------|-----|--| -| `dataSourceName` | data source name | is | String | -| `system` | system name | is | String | -| `database` | database name | is | String | -| `table` | table name | is | String | - -Example response: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "sparkSql": { - "ddl": "CREATE TEMPORARY TABLE test USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://localhost:3306/test', dbtable 'test', user 'root', password 'password' )", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` -Currently supports jdbc, kafka, elasticsearch, mongo data source, you can register spark table according to SparkSQLDDL for query - -### Generate JdbcSQL - -Interface address: /api/rest_j/v1/metadataQuery/getJdbcSql - -Request method: GET - -Request data type: application/x-www-form-urlencoded - -Request parameters: - -| Parameter name | Description | Required | Data type | -|-------------------------------|-------|-----|--| -| `dataSourceName` | data source name | is | String | -| `system` | system name | is | String | -| `database` | database name | is | String | -| `table` | table name | is | String | - -Example response: - -```json -{ - "method": null, - "status": 0, - "message": "OK", - "data": { - "jdbcSql": { - "ddl": "CREATE TABLE `test` (\n\t `id` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'The column name is id',\n\t `name` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL COMMENT 'The column name is name',\n\t PRIMARY KEY (`id`)\n\t) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci", - "dml": "INSERT INTO test SELECT * FROM ${resultTable}", - "dql": "SELECT id,name FROM test" - } - } -} -``` - -Currently supports JDBC data sources, such as: mysql, oracle, postgres, etc. JdbcSQLDDL can be used for front-end display. - -## 3. Precautions -1. You need to register the data source first - -## 4. 
Implementation principle
-### Generate SparkSQL implementation principle
-Define JDBC_DDL_SQL_TEMPLATE and substitute the data source information into it
-```java
-  public static final String JDBC_DDL_SQL_TEMPLATE =
-      "CREATE TEMPORARY TABLE %s "
-          + "USING org.apache.spark.sql.jdbc "
-          + "OPTIONS ("
-          + " url '%s',"
-          + " dbtable '%s',"
-          + " user '%s',"
-          + " password '%s'"
-          + ")";
-```
-
-### Generate JdbcSQL implementation principle
-Concatenate the DDL according to the table schema information
-```java
-public String generateJdbcDdlSql(String database, String table) {
-    StringBuilder ddl = new StringBuilder();
-    ddl.append("CREATE TABLE ").append(String.format("%s.%s", database, table)).append(" (");
-
-    try {
-      List<MetaColumnInfo> columns = getColumns(database, table);
-      if (CollectionUtils.isNotEmpty(columns)) {
-        for (MetaColumnInfo column : columns) {
-          ddl.append("\n\t").append(column.getName()).append(" ").append(column.getType());
-          if (column.getLength() > 0) {
-            ddl.append("(").append(column.getLength()).append(")");
-          }
-          if (!column.isNullable()) {
-            ddl.append(" NOT NULL");
-          }
-          ddl.append(",");
-        }
-        String primaryKeys =
-            columns.stream()
-                .filter(MetaColumnInfo::isPrimaryKey)
-                .map(MetaColumnInfo::getName)
-                .collect(Collectors.joining(", "));
-        if (StringUtils.isNotBlank(primaryKeys)) {
-          ddl.append(String.format("\n\tPRIMARY KEY (%s),", primaryKeys));
-        }
-        ddl.deleteCharAt(ddl.length() - 1);
-      }
-    } catch (Exception e) {
-      LOG.warn("Fail to get Sql columns(Failed to get the field list)");
-    }
-
-    ddl.append("\n)");
-
-    return ddl.toString();
-}
-```
-
-Some data sources support obtaining the DDL directly
-
-**mysql**
-```sql
-SHOW CREATE TABLE 'table'
-```
-
-**oracle**
-```sql
-SELECT DBMS_METADATA.GET_DDL('TABLE', 'table', 'database') AS DDL FROM DUAL
-```
\ No newline at end of file
diff --git a/versioned_docs/version-1.5.0/feature/hive-engine-support-concurrent.md b/versioned_docs/version-1.5.0/feature/hive-engine-support-concurrent.md
deleted file mode 100644
index fc19a66f916..00000000000
--- a/versioned_docs/version-1.5.0/feature/hive-engine-support-concurrent.md
+++ /dev/null
@@ -1,24 +0,0 @@
----
-title: hive engine supports concurrency and multiplexing
-sidebar_position: 0.3
----
-
-## 1. Requirement background
-hiveEngineConn supports concurrency, reduces the resource consumption of starting the hive engine, and improves the engine reuse rate.
-
-## 2. Instructions for use
-First, modify the linkis-engineconn.properties file in the linkis-engineconn-plugins/hive/src/main/resources directory,
-and set linkis.hive.engineconn.concurrent.support to true.
-```
-# Support parallel execution
-wds.linkis.engineconn.support.parallelism=true
-
-# Concurrency limit, the default is 10
-linkis.hive.engineconn.concurrent.limit=10
-```
-
-Submit a hive job; when the first job is complete, submit another one. You can see that the hive engine has been reused.
-
-Restart the cg-linkismanager service after modifying the configuration, or make it take effect through the [Engine Refresh API](../api/http/linkis-cg-engineplugin-api/engineconn-plugin-refresh.md).
-## 3. Precautions
-1. Wait for the first hive task to execute successfully before submitting the second one. Submitting multiple tasks at the same time the first time may start multiple ECs because no EC is available yet.
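
A minimal sketch of how the reuse described above can be observed from the command line, assuming hive 3.1.3 and the linkis-cli flags used elsewhere in this document (the `hql` code type, the SQL snippets, and the install path are illustrative):

```shell
# Sketch: submit two hive tasks one after another and observe that the
# second task reuses the EC started by the first (version and path are assumptions).
sh ./bin/linkis-cli -engineType hive-3.1.3 -codeType hql \
  -code "select 1" -submitUser hadoop -proxyUser hadoop

# After the first task finishes, submit again; the same hive EC should be reused.
sh ./bin/linkis-cli -engineType hive-3.1.3 -codeType hql \
  -code "select 2" -submitUser hadoop -proxyUser hadoop
```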
\ No newline at end of file diff --git a/versioned_docs/version-1.5.0/feature/other.md b/versioned_docs/version-1.5.0/feature/other.md deleted file mode 100644 index aeb42806873..00000000000 --- a/versioned_docs/version-1.5.0/feature/other.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: Description of other features -sidebar_position: 0.6 ---- - -## 1. Do not kill EC when ECM restarts -When the ECM restarts, there is an option not to kill the engine, but to take over the existing surviving engine. Makes the Engine Connection Manager (ECM) service stateless. - -## 2. Remove json4s dependency -Different versions of spark depend on different json4s versions, which is not conducive to the support of multiple versions of spark. We need to reduce this json4s dependency and remove json4s from linkis. -For example: spark2.4 needs json4s v3.5.3, spark3.2 needs json4s v3.7.0-M11. - -## 3. EngineConn module definition depends on engine version -The version definition of the engine is in `EngineConn` by default. Once the relevant version is changed, it needs to be modified in many places. We can put the relevant version definition in the top-level pom file. When compiling a specified engine module, it needs to be compiled in the project root directory, and use `-pl` to compile the specific engine module, for example: -``` -mvn install package -pl linkis-engineconn-plugins/spark -Dspark.version=3.2.1 -``` -The version of the engine can be specified by the -D parameter of mvn compilation, such as -Dspark.version=xxx, -Dpresto.version=0.235 -At present, all the underlying engine versions have been moved to the top-level pom file. When compiling the specified engine module, it needs to be compiled in the project root directory, and `-pl` is used to compile the specific engine module. - -## 4. Linkis main version number modification instructions - -Linkis will no longer be upgraded by minor version after version 1.3.2. The next version will be 1.4.0, and the version number will be 1.5.0, 1.6.0 and so on. When encountering a major defect in a released version that needs to be fixed, it will pull a minor version to fix the defect, such as 1.4.1. - - -## 5. LInkis code submission main branch instructions - -The modified code of Linkis 1.3.2 and earlier versions is merged into the dev branch by default. In fact, the development community of Apache Linkis is very active, and new development requirements or repair functions will be submitted to the dev branch, but when users visit the Linkis code base, the master branch is displayed by default. Since we only release a new version every quarter, it seems that the community is not very active from the perspective of the master branch. Therefore, we decided to merge the code submitted by developers into the master branch by default starting from version 1.4.0. 
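
Building on section 3 above, a hedged sketch of compiling a single engine module from the project root while overriding its engine version (the presto module path and version number are examples only):

```shell
# Sketch: build one engine module from the project root with -pl and
# override its engine version with -D (module path and version are examples).
mvn install package -pl linkis-engineconn-plugins/presto -Dpresto.version=0.235
```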
diff --git a/versioned_docs/version-1.5.0/feature/overview.md b/versioned_docs/version-1.5.0/feature/overview.md deleted file mode 100644 index 34160261e3a..00000000000 --- a/versioned_docs/version-1.5.0/feature/overview.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: version overview -sidebar_position: 0.1 ---- - -- [Base engine dependencies, compatibility, default version optimization](./base-engine-compatibilty.md) -- [Hive engine connector supports concurrent tasks](./hive-engine-support-concurrent.md) -- [Support more datasources](../user-guide/datasource-manual#31-jdbc-datasource) -- [Spark ETL enhancements](./spark-etl.md) -- [Generate SQL from data source](./datasource-generate-sql.md) -- [linkis-storage supports S3 file system (experimental version)](../deployment/deploy-quick#343-s3-mode) -- [add postgresql database support (experimental version)](../deployment/deploy-quick#22-configuration database information) -- [Add impala engine support (experimental version)](../engine-usage/impala.md) -- [Other feature description](./other.md) -- [version of Release-Notes](/download/release-notes-1.4.0) - -## Parameter changes - -| module name (service name) | type | parameter name | default value | description | -| ----------- | ----- | ------------------------------- ------------------------- | ---------------- | ------- --------------------------------------------------- | -| mg-eureka | New | eureka.instance.metadata-map.linkis.app.version | ${linkis.app.version} | Eureka metadata reports Linkis application version information| -| mg-eureka | Add | eureka.instance.metadata-map.linkis.conf.version | None | Eureka metadata report Linkis service version information | -| mg-eureka | Modify | eureka.client.registry-fetch-interval-seconds | 8 | Eureka Client pull service registration information interval (seconds) | -| mg-eureka | New | eureka.instance.lease-renewal-interval-in-seconds | 4 | The frequency (seconds) at which the eureka client sends heartbeats to the server | -| mg-eureka | new | eureka.instance.lease-expiration-duration-in-seconds | 12 | eureka waits for the next heartbeat timeout (seconds)| -| EC-shell | Modify | wds.linkis.engineconn.support.parallelism | true | Whether to enable parallel execution of shell tasks | -| EC-shell | Modify | linkis.engineconn.shell.concurrent.limit | 15 | Concurrent number of shell tasks | -| Entrance | Modify | linkis.entrance.auto.clean.dirty.data.enable | true | Whether to clean dirty data at startup | - - - -## Database table changes -For details, see the upgrade schema `db/upgrade/1.4.0_schema` file in the corresponding branch of the code warehouse (https://github.com/apache/linkis) \ No newline at end of file diff --git a/versioned_docs/version-1.5.0/feature/spark-etl.md b/versioned_docs/version-1.5.0/feature/spark-etl.md deleted file mode 100644 index 2151cb74cf9..00000000000 --- a/versioned_docs/version-1.5.0/feature/spark-etl.md +++ /dev/null @@ -1,642 +0,0 @@ ---- -title: Support spark ETL data synchronization -sidebar_position: 0.4 ---- - -## 1. Background -Using the Spark ETL function, users can synchronize Spark data by configuring json. - -## 2. Supported types - -currently supported types -```text -jdbc, file, redis, kafka, elasticsearch, mongo, datalake (hudi, delta) -``` - -## 3. 
General configuration instructions
-```text
-name: data source name
-type: contains `source`, `transformation` and `sink`, corresponding to input, transformation and output respectively
-options: configuration parameters
-saveMode: save mode, currently supports `overwrite` and `append`
-path: file path, can be 'file://' or 'hdfs://' (default)
-`resultTable` needs to correspond to `sourceTable`
-```
-
-## 4. Instructions for use
-
-### 4.1 Add the required jar package
-To use a data source, upload the corresponding spark connector jar to the spark/jars directory, i.e. $SPARK_HOME/jars
-
-The spark connector jar can be obtained with the following commands
-
-```text
-git clone https://github.com/apache/linkis.git
-
-cd linkis
-
-git checkout master
-
-cd linkis-engineconn-plugins/spark/scala-2.12
-
-mvn clean install -Dmaven.test.skip=true
-```
-
-The compiled spark connector jars are located in the following directory
-```text
-linkis/linkis-engineconn-plugins/spark/scala-2.12/target/out/spark/dist/3.2.1/lib
-```
-
-### 4.2 linkis-cli submit task example
-
-Pass the concrete JSON as the value of the -code parameter; note that the quotation marks inside the JSON must be escaped.
-
-```shell
-sh /appcom/Install/linkis/bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "" -submitUser hadoop -proxyUser hadoop
-```
-
-Example of submitting a redis data synchronization task with linkis-cli
-```shell
-sh ./bin/linkis-cli -engineType spark-3.2.1 -codeType data_calc -code "{\"plugins\":[{\"name\":\"file\",\"type\":\"source\",\"config\":{\"resultTable\":\"test\",\"path\":\"hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin\",\"serializer\":\"csv\",\"options\":{\"header\":\"true\",\"delimiter\":\";\"},\"columnNames\":[\"name\",\"age\"]}},{\"name\":\"redis\",\"type\":\"sink\",\"config\":{\"sourceTable\":\"test\",\"host\":\"wds07\",\"port\":\"6679\",\"auth\":\"password\",\"targetTable\":\"spark_etl_test\",\"saveMode\":\"append\"}}]}" -submitUser hadoop -proxyUser hadoop
-```
-### 4.3 Synchronization JSON script description for each data source
-
-#### 4.3.1 jdbc
-
-Configuration instructions
-```text
-url: jdbc connection information
-user: user name
-password: password
-query: sql query statement
-```
-
-json code
-
-```json
-{
-    "sources": [
-        {
-            "name": "jdbc",
-            "type": "source",
-            "config": {
-                "resultTable": "test1",
-                "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8",
-                "driver": "com.mysql.jdbc.Driver",
-                "user": "root",
-                "password": "123456",
-                "query": "select * from dip_linkis.linkis_ps_udf_baseinfo",
-                "options": {
-                }
-            }
-        }
-    ],
-    "transformations": [
-        {
-            "name": "sql",
-            "type": "transformation",
-            "config": {
-                "resultTable": "T1654611700631",
-                "sql": "select * from test1"
-            }
-        }
-    ],
-    "sinks": [
-        {
-            "name": "jdbc",
-            "type": "sink",
-            "config": {
-                "sourceTable": "T1654611700631",
-                "url": "jdbc:mysql://127.0.0.1:3306/dip_linkis?characterEncoding=UTF-8",
-                "driver": "com.mysql.jdbc.Driver",
-                "user": "root",
-                "password": "123456",
-                "targetTable": "linkis_ps_udf_baseinfo2",
-                "options": {
-                }
-            }
-        }
-    ]
-}
-```
-
-A new jar needs to be added; select the corresponding jar according to the data source actually used
-```text
-DmJdbcDriver18.jar
-kingbase8-8.6.0.jar
-postgresql-42.3.8.jar
-```
-
-#### 4.3.2 file
-
-Configuration instructions
-
-```text
-serializer: file format, can be `csv`, `parquet`, etc.
-columnNames: column names -``` - - -json code - -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test2", - "path": "hdfs:///tmp/test_new_no_partition", - "serializer": "csv", - "columnNames": ["id", "create_user", "udf_name", "udf_type", "tree_id", "create_time", "update_time", "sys", "cluster_name", "is_expire", "is_shared"] - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "test2", - "path": "hdfs:///tmp/test_new", - "partitionBy": ["create_user"], - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -spark-excel-2.12.17-3.2.2_2.12-3.2.2_0.18.1.jar -``` - -#### 4.3.3 redis - -```text -sourceTable: source table, -host: ip address, -port": port, -auth": password, -targetTable: target table, -saveMode: support append -``` - -json code -```json -{ - "plugins":[ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "test", - "path": "hdfs://linkishdfs/tmp/linkis/spark_etl_test/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - }, - { - "name": "redis", - "type": "sink", - "config": { - "sourceTable": "test", - "host": "wds07", - "port": "6679", - "auth": "password", - "targetTable": "spark_etl_test", - "saveMode": "append" - } - } - ] -} -``` - -Need to add new jar -```text -jedis-3.2.0.jar -commons-pool2-2.8.1.jar -spark-redis_2.12-2.6.0.jar -``` - -#### 4.3.4 kafka - -Configuration instructions -```text -servers: kafka connection information -mode: currently supports `batch` and `stream` -topic: kafka topic name -``` - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "batch", - "topic": "test121212" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "kafka", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "servers": "localhost:9092", - "topic": "test121212" - } - } - ], - "sinks": [ - { - "name": "kafka", - "config": { - "sourceTable": "T1654611700631", - "servers": "localhost:9092", - "mode": "stream", - "topic": "test55555" - } - } - ] -} -``` - -Need to add new jar -``` -kafka-clients-2.8.0.jar -spark-sql-kafka-0-10_2.12-3.2.1.jar -spark-token-provider-kafka-0-10_2.12-3.2.1.jar -``` - -#### 4.3.5 elasticsearch - -Configuration instructions -```text -node: elasticsearch ip -port: elasticsearch port -index: elasticsearch index name -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "elasticsearch", - "config": { - "sourceTable": "T1654611700631", - "node": "localhost", - "port": "9200", - "index": "estest", - "saveMode": "overwrite" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "elasticsearch", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "node": 
"localhost", - "port": "9200", - "index": "estest" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -elasticsearch-spark-30_2.12-7.17.7.jar -``` - -#### 4.3.6 mongo - -Configuration instructions -```text -uri: mongo connection information -database: mongo database -collection: mongo collection -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "mongo", - "config": { - "sourceTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test", - "saveMode": "overwrite" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "mongo", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "uri": "mongodb://localhost:27017/test", - "database": "test", - "collection": "test" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/json", - "saveMode": "overwrite", - "serializer": "json" - } - } - ] -} -``` - -Need to add new jar -``` -bson-3.12.8.jar -mongo-spark-connector_2.12-3.0.1.jar -mongodb-driver-core-3.12.8.jar -mongodb-driver-sync-3.12.8.jar -``` - -#### 4.3.7 delta - -Configuration instructions -```text -tableFormat: currently supports `hudi` and `delta` -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - "saveMode": "overwrite" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "delta", - "path": "file://{filePath}/delta", - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header": "true" - }, - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -delta-core_2.12-2.0.2.jar -delta-storage-2.0.2.jar -``` - -#### 4.3.8 hudi - -Configuration instructions -```text -tableFormat: currently supports `hudi` and `delta` -``` - - -Data written to json code -```json -{ - "sources": [ - { - "name": "file", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "path": "file://{filePath}/etltest.dolphin", - "serializer": "csv", - "options": { - "header": "true", - "delimiter": ";" - }, - "columnNames": ["name", "age"] - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "datalake", - "config": { - "sourceTable": "T1654611700631", - "tableFormat": "hudi", - "options": { - "hoodie.table.name": "huditest", - 
"hoodie.datasource.write.recordkey.field": "age", - "hoodie.datasource.write.precombine.field":"age" - }, - "path": "file://{filePath}/hudi", - "saveMode": "append" - } - } - ] -} -``` - -Data read json code -```json -{ - "sources": [ - { - "name": "datalake", - "type": "source", - "config": { - "resultTable": "T1654611700631", - "tableFormat": "hudi", - "path": "file://{filePath}/hudi", - } - } - ], - "transformations": [ - { - "name": "sql", - "type": "transformation", - "config": { - "resultTable": "T111", - "sql": "select * from T1654611700631" - } - } - ], - "sinks": [ - { - "name": "file", - "config": { - "sourceTable": "T1654611700631", - "path": "file://{filePath}/csv", - "saveMode": "overwrite", - "options": { - "header": "true" - }, - "serializer": "csv" - } - } - ] -} -``` - -Need to add new jar -``` -hudi-spark3.2-bundle_2.12-0.13.0.jar -``` \ No newline at end of file