diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..3c77562 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,20 @@ +# Top-most EditorConfig file +root = true + +# All files +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true + +# Rust files +[*.rs] +indent_style = space +indent_size = 4 +max_line_length = 100 + +# TOML files (like Cargo.toml) +[*.toml] +indent_style = space +indent_size = 2 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..efd6d01 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,24 @@ +name: Cargo Build & Test + +on: + push: + pull_request: + +env: + CARGO_TERM_COLOR: always + +jobs: + build_and_test: + name: Rust project - latest + runs-on: ubuntu-latest + strategy: + matrix: + toolchain: + - stable + - beta + #- nightly + steps: + - uses: actions/checkout@v3 + - run: rustup update ${{ matrix.toolchain }} && rustup default ${{ matrix.toolchain }} + - run: cargo build --verbose + - run: cargo test --verbose diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..93252fe --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,13 @@ +name: pre-commit + +on: + push: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v3 + - uses: pre-commit/action@v3.0.0 diff --git a/.gitignore b/.gitignore index 3076773..94c7b4e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,19 +1,17 @@ -.metadata/ -*.cache -sandbox.services/ +# Apple's crap .DS_Store -*~ -sandbox.* -*.iml -*.ipr -*.iws -keystore -/.idea/ -.settings -.directory -target -*/.svn/* -**/.svn/* -**/bin/* -/.svn/ -net.modelbased.sensapp.backyard.weather/src/main/resources/ \ No newline at end of file + +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# Remove Cargo.lock from gitignore if 
creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb diff --git a/.gitlint b/.gitlint new file mode 100644 index 0000000..f0e526a --- /dev/null +++ b/.gitlint @@ -0,0 +1,5 @@ +[general] +# Ignore rules, reference them by id or name (comma-separated) +ignore=body-is-missing,CC1 + +contrib=contrib-title-conventional-commits diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..9fb42b0 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,23 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-added-large-files + - id: check-case-conflict + - id: mixed-line-ending + - id: trailing-whitespace + - id: check-json + - id: check-yaml + - id: check-xml + - id: check-ast + - id: end-of-file-fixer + - id: check-merge-conflict + - id: no-commit-to-branch # no commits to main + - repo: https://github.com/gitleaks/gitleaks + rev: v8.18.1 + hooks: + - id: gitleaks + - repo: https://github.com/jorisroovers/gitlint + rev: v0.19.1 + hooks: + - id: gitlint diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..e4d2481 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,13 @@ +{ + "recommendations": [ + "matklad.rust-analyzer", + "serayuzgur.crates", + "EditorConfig.EditorConfig", + "bungcip.better-toml", + "vadimcn.vscode-lldb", + "usernamehw.errorlens" + ], + "unwantedRecommendations": [ + "rust-lang.rust-vscode" + ] +} diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..b7107f7 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,42 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug executable 
'sensapp'", + "cargo": { + "args": [ + "build", + "--bin=sensapp", + "--package=sensapp" + ], + "filter": { + "name": "sensapp", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + }, + { + "type": "lldb", + "request": "launch", + "name": "Debug unit tests in executable 'sensapp'", + "cargo": { + "args": [ + "test", + "--no-run", + "--bin=sensapp", + "--package=sensapp" + ], + "filter": { + "name": "sensapp", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..171fb82 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,74 @@ +# Contributing + +Thank you for your contributions in making SensApp better! + +## Discuss Changes + +For significant changes, please open an issue first to discuss what you would like to change. This helps us align your contributions with our project goals. + +## Pre-commit hooks + +Please use pre-commit hooks to check your changes locally. Your changes will also be checked by the continuous integration (CI) pipeline, but it is faster to catch errors locally before pushing your changes. + +``` +pip install pre-commit +pre-commit install +pre-commit install --hook-type commit-msg +``` + +## Language + +We use Rust. + +## Code Style + +We follow the [Rust Style Guide](https://github.com/rust-lang/rust/tree/HEAD/src/doc/style-guide/src). + +## Tests + +Update tests as appropriate. New features should come with additional tests. + +## Documentation + + **Documentation**: Update the `README.md` or other documentation with details of changes to the interface or additional features. + +## Issues + +Make sure you use the latest version of SensApp and please include enough information about the issue. The more information you provide, the easier it is to reproduce the issue and to fix it. + +## ~~Pull~~ Merge Requests + +You are not allowed to push to the `main` branch. Please create a merge request instead. 
+ +Please do NOT `squash` your merge requests. You are welcome to organise and clean your commits first, but we want to keep the commit history. + +Avoid merging your own pull requests without a review. Do not merge a pull request with failing tests. + +## Commit Messages + +We use [Conventional Commits](https://www.conventionalcommits.org/). + +``` +<type>[(optional scope)]: <description> + +[optional body] + +[optional footer(s)] +``` + +Examples: + +``` +fix: prevent infinite loop when it rains +docs(architecture): correct spelling of banana +``` + +Valid types are: fix, feat, chore, docs, style, refactor, perf, test, revert, ci, and build. + +To please the [gitmoji](https://gitmoji.dev) enthusiasts, Unicode emojis are allowed but not enforced. Commit messages with emojis still must respect the conventional commit format. + +Examples: +``` +fix: 🐛 prevent infinite loop when it rains +docs(architecture): 📝 correct spelling of banana +``` diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..d34821f --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "sensapp" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 7a1828c..d11a545 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,50 @@ -# SensApp in a Nutshell +# ![SensApp](./docs/sensapp_logo.png) -SensApp is a platform to support sensor based application. It is developed by -SINTEF (IKT division, NSS Department, MOD research group). +SensApp is an open-source sensor data platform developed by SINTEF for research purposes. It manages time-series data from a multitude of sensors. -As a basis, SensApp provides four essential services to support the definition of IoT applications. The Registry stores metadata about the sensors (e.g., description and creation date). The Database servive stores raw data from the sensors using a MongoDB database. The Notifier component sends notifications to third-party applications when relevant data are pushed (e.g., when new data collected by air quality sensors become available). The Dispatcher orchestrates the other components: it receives data from the sensors, stores these data in the Database according to the metadata from the Registry, and then triggers the notification mechanisms for the new data. Finally, the Admin web page provides capabilities to manage sensors and visualise data using a graphical user interface. In order to be deployed, SensApp requires a servlet container and a database, while the SensApp admin requires a servlet container only. 
+It efficiently handles everything from small time-series datasets at the edge to large-scale big-data digital twins. -![alt tag](http://sensapp.org/img/sensapp_logo.png) +## Features -This repository is oriented to developers. End-users or business experts -should refer to the following webpage: http://sensapp.org +- **Flexible Time Series Database Storage**: Supports various time-series databases like SQLite, PostgreSQL (with optional TimeScaleDB plugin), and ClickHouse, with the potential to extend support to other databases in the future. +- **Data Lake Storage**: Supports Parquet files over S3 compatible object stores for long-term time-series data storage. +- **Multiple Data Ingestion Protocols**: Easy data ingestion via HTTP REST API, MQTT, AMQP, Kafka, and NATS. +- **Compatibility with Existing Pipelines**: Offers Prometheus Remote Write and InfluxDB line format support for seamless integration into existing sensor data pipelines. +- **Data formats**: Supports various data formats like JSON, CSV, Parquet, or SenML. -## How to create a new SensApp Service? +## Architecture - * Run the maven tool from the command line +SensApp should be stateless and scale from the edge to big data. The message queue software and the database software solve the complex problems. SensApp is a simple adapter between them. -
mvn archetype:generate
+* SensApp supports simple deployments that need no message queue, only an embedded SQLite database. +* SensApp supports medium deployments with a single message broker and a PostgreSQL database. +* For larger deployments, SensApp advises a distributed message queue, an automatic load balancer for the SensApp instances, and a ClickHouse cluster. - * Select the "service" archetype provided by SensApp, and fill in the blanks +Check the [ARCHITECTURE.md](docs/ARCHITECTURE.md) file for more details. -
[...]
-Choose a number or apply filter (format: [groupId:]artifactId, case sensitive contains): 186: sensapp
-Choose archetype:
-1: local -> net.modelbased.sensapp.archetype:net.modelbased.sensapp.archetype.service (A Prototypical SensApp Service, integrated with the others)
-2: local -> net.modelbased.sensapp.archetype:net.modelbased.sensapp.archetype.system (A Prototypical SensApp System, integrating Services)
-Choose a number or apply filter (format: [groupId:]artifactId, case sensitive contains): : 1
-Define value for property 'groupId': : net.modelbased.sensapp.service 
-Define value for property 'artifactId': : net.modelbased.sensapp.service.registry
-Define value for property 'version':  1.0-SNAPSHOT: : 0.0.1-SNAPSHOT
-Define value for property 'package':  net.modelbased.sensapp.service: : net.modelbased.sensapp.service.registry
-Confirm properties configuration:
-groupId: net.modelbased.sensapp.service
-artifactId: net.modelbased.sensapp.service.registry
-version: 0.0.1-SNAPSHOT
-package: net.modelbased.sensapp.service.registry
- Y: : Y
+## Built With Rust™️ - * Enjoy! +SensApp is developed using Rust, a language known for its performance, memory safety, and annoying borrow checker. SensApp used to be written in Scala, but the new author prefers Rust. +## Contributing -[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/SINTEF-9012/sensapp/trend.png)](https://bitdeli.com/free "Bitdeli Badge") +We appreciate your interest in contributing to SensApp! Contributing is as simple as submitting an issue or a merge/pull request. Please read the [CONTRIBUTING.md](CONTRIBUTING.md) file for more details. +## License + +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. + +The SensApp software is provided "as is," with no warranties, and the creators of SensApp are not liable for any damages that may arise from its use. + +## You may not want to use it in production (yet) + +SensApp is currently under development. It is not yet ready for production. + +## Acknowledgments + +We thank [the historical authors of SensApp](https://github.com/SINTEF/sensapp/graphs/contributors) who created the first version a decade ago. + +SensApp is developed by +[SINTEF](https://www.sintef.no) ([Digital division](https://www.sintef.no/en/digital/), [Sustainable Communication Technologies department](https://www.sintef.no/en/digital/departments-new/department-of-sustainable-communication-technologies/), [Smart Data research group](https://www.sintef.no/en/expertise/digital/sustainable-communication-technologies/smart-data/)). + +We also thank the open-source community for all the tools they create and maintain that allow SensApp to exist. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..8c3bf14 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,139 @@ +# SensApp Architecture + +SensApp should be an accommodating platform to support sensor-based applications. 
+ +It should be relatively small and simple to transform incoming sensor data, persist it, and make it available to other applications. + +## Simple SensApp Deployment Example +```mermaid +graph LR + Sensor1[Sensor 1] --> SensApp + Sensor2[Sensor 2] --> SensApp + Sensor3[Sensor 3] --> SensApp + Sensor4[Sensor 4] --> SensApp + SensApp <--> DB[Database] + + SensApp[SensApp] +``` + +## Advanced SensApp Deployment Example +```mermaid +graph LR + + subgraph Sensors + Sensor1(Sensor 1) + Sensor2(Sensor 2) + Sensor3(Sensor 3) + Sensor4(Sensor 4) + MoreSensors((...)) + end + + subgraph MQs["Distributed Message Queue"] + MQ1[MQ 1] + MQ2[MQ 2] + MQ3[MQ 3] + MoreMQ((...)) + end + + subgraph SensApp["SensApp Cluster"] + SensApp1[SensApp 1] + SensApp2[SensApp 2] + SensApp3[SensApp 3] + MoreSensApp((...)) + end + + subgraph DBs["Fancy Database Cluster"] + DB1[DB 1] + DB2[DB 2] + DB3[DB 3] + MoreDB((...)) + end + + Sensor1 --> MQs + Sensor2 --> MQs + Sensor3 --> MQs + Sensor4 --> MQs + MQs --> SensApp + MQs --> SensApp + MQs --> SensApp + SensApp <--> DBs + SensApp <--> DBs + SensApp <--> DBs +``` + +## Technology and programming language + +Previously developed in Scala, we rewrote SensApp in Rust. The main reason is that the new author prefers Rust over Scala or Golang. The second reason comes from the results of the paper [Energy efficiency across programming languages: how do energy, time, and memory relate?](https://dl.acm.org/doi/10.1145/3136014.3136031), which shows Rust as one of the most energy-efficient programming languages while having memory safety. + +SensApp uses the Rust Tokio runtime for asynchronous programming. + +## Internal Architecture + +SensApp uses an event-based architecture internally, with a message bus to communicate between components. The messages are lightweight, only internal, and do not rely on a network. 
+ +## Incoming data streams + +SensApp should eventually support the following incoming data streams: + + - Stream-based protocols: + - MQTT + - AMQP + - Kafka + - NATS + - HTTP REST Push + - HTTP REST Pull + - InfluxDB Write API + - Prometheus Remote Write API + +## Supported incoming data formats + +Sensor data can come in many formats, and as an accommodating platform, SensApp should support the most common ones. + +CSV, JSON, SenML, and Parquet are supported. But more formats can be added whenever needed. + +We also support InfluxDB line protocol, and the Prometheus remote write protocol to allow an easy transition from these platforms to SensApp. + +## Storage + +SensApp should support various storage backends. The best storage backend for time series has yet to exist. + + * For small deployments, SQLite is used. + * For medium deployments, PostgreSQL is used. + * It is optional to use the TimescaleDB plugin or Citus Columnar. + * For larger deployments, ClickHouse is used. + + * SensApp can also produce Parquet files stored in S3-compatible object stores. + +SensApp can use other storage backends in the future. Could it be Cassandra, Apache IoTDB, OpenTSDB, QuestDB, HoraeDB, or something new? + +We base our storage on the findings of the paper [TSM-Bench: Benchmarking Time Series Database Systems for Monitoring Applications](https://dl.acm.org/doi/abs/10.14778/3611479.3611532) that shows that ClickHouse is a better choice than most databases for time series at scale, at the moment. Unfortunately, the paper didn't include IoTDB, and the new author doesn't like the JVM much, so IoTDB support is not a priority. Other databases are relatively new, and we favour the most popular ones for now. + +SensApp also supports SQLite for small deployments and local persistence. The SQLite storage feature cannot scale to large deployments, but many deployments are small, and SQLite is a good choice for these. 
+ +PostgreSQL is also supported as it is the most popular database according to the [Stack Overflow Developer Survey 2023](https://survey.stackoverflow.co/2023/) and should provide a good compromise between performance and convenience. The choice between Vanilla PostgreSQL tables, TimeScaleDB bucket-style (hyper) tables, or Citus columnar tables is left to the user. + +Columnar storage with compression fits well with time series data, and a distributed ClickHouse cluster is the favoured choice for large deployments. + +SensApp used to rely on MongoDB, as it was created during the NoSQL hype, but the performance was very poor for this use case. + +## Scalability + +SensApp should be able to scale vertically and horizontally. However, the burden of horizontal scaling is left to other components outside SensApp. + +At scale, it is strongly advised to rely on the message queue ingestion pipelines when possible. + +The publisher should have a mechanism to automatically retry when SensApp returns a 503 HTTP error because of a high load. + +SensApp should scale horizontally and not persist state on its own. It keeps relatively small buffers in memory to improve performance and relies on the storage backend to persist data. Publishers should consider the data as persisted once SensApp acknowledges it. + +The storage layer should scale as well. SQLite on a network filesystem could work, but using a distributed storage backend is more advisable when one single database instance isn't enough. + +It is essential to mention that horizontal scalability comes with a higher complexity and energy cost. Prefer vertical scalability when possible. In 2024, single database servers can handle high loads, with hundreds of cores, petabytes of storage, and terabytes of RAM. + +## Resilience + +SensApp should acknowledge the persistence of the incoming data once the storage backend has persisted it. 
If SensApp crashes or is shut down, the publisher should keep the data and wait for SensApp to return online. + +The publisher should favour the message queue ingestion pipeline if resilience is a concern. + +The storage backend and the message queue should be resilient. diff --git a/docs/DATAMODEL.md b/docs/DATAMODEL.md new file mode 100644 index 0000000..f82c8d2 --- /dev/null +++ b/docs/DATAMODEL.md @@ -0,0 +1,157 @@ +# SensApp Data Model + +The SensApp data model is inspired by the [SenML proposed standard](https://www.rfc-editor.org/rfc/rfc8428), the [Prometheus remote write](https://prometheus.io/docs/concepts/remote_write_spec/) and [remote read](https://github.com/prometheus/prometheus/blob/main/prompb/types.proto) data models, the [InfluxDB line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/), the [OpenMetrics](https://openmetrics.io) proposed standard, and [ThingModel](https://github.com/SINTEF-9012/ThingModel). + +A sensor has a unique name used to identify it. + +Each sensor contains only one type of data. If your sensor measures more than one thing, like temperature and humidity, it should be represented as two sensors in SensApp. + +Sensors can have labels attached to them, which are key-value pairs. Those are not tracked over time. They are used to add metadata to the sensors. If you wish to track labels, you should create a sensor for each label and use the sensor value to store the label value. + +SensApp distinguishes between: + + - **Integer** values, which are 64-bit integers. + - **Float** values, which are IEEE 754 64-bit floating point numbers. Those are **approximate** values; see the section about floating point numbers for more information. + - **Numeric** values, which are decimal numbers, that shouldn't be approximate values. SQLite 3 does not support this type (the experimental SQLite4 project did), but PostgreSQL and ClickHouse do. 
+ - **String** values, which are UTF-8 encoded strings. + - **Boolean** values, which are true or false. + - **Localisation** values, which are latitude and longitude coordinates, with an optional altitude. We consider earth as the center of the universe. _Do not_ use this type for space projects, and rely on multiple sensors instead. + + +```mermaid +erDiagram + + SENSORS { + String name UK "Name of the sensor" + UUID id PK "UUID (v7 by default) of the sensor" + TypeEnum type PK "The type of the sensor (integer, float, string, boolean), part of the primary key" + Serial unit FK "The unit of the sensor, for documentation purposes, if provided" + } + + UNITS { + Serial id PK + String name UK + String description + } + SENSORS }o--|| UNITS : "" + + LABELS { + UUID sensor PK + BigSerial named PK + BigSerial description FK + } + SENSORS ||--o{ LABELS : "" + + LABELS_NAME_DICTIONARY { + BigSerial id PK + String name UK + } + LABELS }o--|| LABELS_NAME_DICTIONARY : "has one" + LABELS_DESCRIPTION_DICTIONARY { + BigSerial id PK + String description UK + } + LABELS }o--|| LABELS_DESCRIPTION_DICTIONARY : "" + + + INTEGER_VALUES { + UUID sensor + DateTime datetime + Integer value + } + + NUMERIC_VALUES { + UUID sensor + DateTime datetime + Numeric value + } + + FLOAT_VALUES { + UUID sensor + DateTime datetime + Float value + } + + STRING_VALUES { + UUID sensor + DateTime datetime + BigSerial value FK + } + STRINGS_VALUES_DICTIONARY { + BigSerial id PK + String value UK + } + STRING_VALUES }o--|| STRINGS_VALUES_DICTIONARY : "" + + BOOLEAN_VALUES { + UUID sensor + DateTime datetime + Boolean value + } + + SENSORS ||--o{ STRING_VALUES : "" + SENSORS ||--o{ INTEGER_VALUES : "" + SENSORS ||--o{ NUMERIC_VALUES : "" + SENSORS ||--o{ FLOAT_VALUES : "" + SENSORS ||--o{ LOCALISATIONS : "" + SENSORS ||--o{ BOOLEAN_VALUES : "" + + %% Localisations are common enough to be part of the core data model + LOCALISATIONS { + UUID sensor + DateTime datetime + Float latitude + Float longitude + } 
+``` + +## Optimisations and Compression + +SensApp does not attempt to optimise or compress the time series data itself. This is left to the storage backend. PostgreSQL with the TimescaleDB extension, ClickHouse, or Parquet files will optimise and compress the data pretty well. + +The SQLite storage backend will not compress the data or optimise it with advanced techniques like Gorilla. + +## Identifiers + +Identifiers should follow the UUID (Universally Unique Identifier) format, a 128-bit number. It has a bit more overhead than the traditional 32-bit or 64-bit identifiers, but it is much more flexible and convenient. + +The most common type of UUID is version 4, which is generated randomly. If the random generator is cryptographically safe, this makes it difficult to predict the other identifiers, thus mitigating the security risks associated with sequential integers. In distributed systems, there is no need for a central server to generate identifiers. The system only needs to generate a very big (122-bit) random number. The probability of a collision, that is, randomly generating the same identifier more than once, is extremely low and not a concern in practice. The oceans would likely boil before a collision happens. + +However, most UUIDs cannot be sorted or indexed efficiently as related entries will have identifiers very far apart. Having entries created about the same time stored with similar identifiers can have benefits for SensApp. For example, if you want to retrieve the last 100 entries created, an index scan will be more efficient if the identifiers are close. Snowflake IDs, ULID, and UUID v7 are types of identifiers that start with the current timestamp, followed by some random bytes. This allows entries created about the same time to have identifiers close to each other while still being unique and difficult to predict. Snowflake also uses the host name of the machine to avoid collisions, but using more random bits sounds like a better solution.
+ +Using a timestamp inside the identifier does give away the creation time, but this is not a concern for SensApp as the datetime is important to communicate anyway. + +While [UUID v7](https://www.ietf.org/archive/id/draft-peabody-dispatch-new-uuid-format-04.html#name-uuid-version-7) is a slowly moving draft that is far from being standardised, SensApp should favour UUID v7 for its identifiers. SQLite, PostgreSQL, and ClickHouse do not validate UUIDs: they accept any 128-bit number formatted as a UUID without checking the version or the variant. This means that SensApp can use UUID v7 identifiers without any issue and switch to another type of identifier in the future if needed. + +## IEEE 754 Floating Point Numbers + +Numbers in SensApp are often represented as IEEE 754 floating point numbers. This is the standard representation of floating point numbers in most IT systems. + +[Quoting the SQLite documentation](https://sqlite.org/floatingpoint.html): + +> **Floating point values are approximate.** +> +> … +> +> Surely that level of precision is sufficient for most purposes? + +Chances are high that your sensors will not be able to measure a value with a precision of 1e-15. If you need to store sensor data with such precision, do not store it as floating point numbers and make sure not to use floating point numbers in your data pipelines. For example, the JSON format uses floating point numbers by default. + +One safe solution is to store your values as strings. This is not going to be as efficient, but it should preserve the precision throughout your data pipelines. + + +## String Deduplication using Dictionaries + +String values should be stored in a separate table to avoid duplication. When using SQLite and PostgreSQL, this is done using a dictionary table and manual code by SensApp.
 When using ClickHouse, this is done automatically by the database using the [`LowCardinality`](https://clickhouse.com/docs/en/sql-reference/data-types/lowcardinality) data type modifier. + +Using a dictionary improves performance in most cases. However, if many strings are unique, performance can be worse than using a regular string. + +[The ClickHouse documentation](https://clickhouse.com/docs/en/sql-reference/data-types/lowcardinality) says the following: + +> The efficiency of using LowCardinality data type depends on data diversity. If a dictionary contains less than 10,000 distinct values, then ClickHouse mostly shows higher efficiency of data reading and storing. If a dictionary contains more than 100,000 distinct values, then ClickHouse can perform worse in comparison with using ordinary data types. + +In practice, we expect sensors not to generate unique strings all the time, so using a dictionary should be a good idea for the majority of use cases. + +## Geolocalisation and Coordinates Systems + +In the current version, SensApp does not enforce any particular coordinate system for geolocalised data. The data is likely going to use WGS84, but it could be ETRS89 or something else. It's up to the publisher and the consumer to agree on the coordinate system used, for now. diff --git a/docs/sensapp_logo.png b/docs/sensapp_logo.png new file mode 100644 index 0000000..198f615 Binary files /dev/null and b/docs/sensapp_logo.png differ diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..e7a11a9 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +}