Merge pull request #18 from cbdq-io/feature/12-add-version-and-created-to-the-met-office-data-package-file

Feature/12 add version and created to the met office data package file
dallinb authored Jun 8, 2024
2 parents 8d5bb8e + 1e4806f commit 823817a
Showing 5 changed files with 130 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pipeline.yml
@@ -28,7 +28,7 @@ jobs:
run: make lint

- name: Build
run: ./uk/gov/metoffice/historic_station_data/scripts/etl.py
run: make build

- name: Test
run: make test
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,10 @@

### New

* Add the created timestamp to the datapackage.json file. [Ben Dalling]

* Add the version to the datapackage.json file. [Ben Dalling]

* Add the GPL-3.0 licence. [Ben Dalling]

### Fix
6 changes: 4 additions & 2 deletions Makefile
@@ -2,10 +2,12 @@

GIT_TAG = 1.0.0

all: lint build test
all: lint avro build test

build:
avro:
make -C avro

build:
./uk/gov/metoffice/historic_station_data/scripts/etl.py

changelog:
180 changes: 104 additions & 76 deletions uk/gov/metoffice/historic_station_data/datapackage.json
@@ -1,8 +1,21 @@
{
"name": "io.cbdq.metoffice.historic_station_data",
"title": "Met Office Historical Station Data",
"contributers": [
{
"path": "https://www.cbdq.io",
"role": "wrangler",
"title": "Cloud Based DQ Ltd."
}
],
"created": "2024-06-08T19:12:11+00:00",
"description": "# Met Office Historical Station Data\n\nA dataset wrangled from the UK Met Office and available at\n<https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data>.\n\nThe wrangling consists of:\n\n- Cleaning some data errors.\n- Setting the sun instrument type to the last known instrument for a station.",
"profile": "tabular-data-package",
"homepage": "https://github.com/cbdq-io/datasets/tree/main/uk/gov/metoffice/historic_station_data",
"image": "https://upload.wikimedia.org/wikipedia/en/thumb/f/f4/Met_Office.svg/150px-Met_Office.svg.png",
"keywords": [
"climate",
"MetOffice",
"change",
"historic"
],
"licenses": [
{
"name": "OGL-UK-3.0",
@@ -15,39 +28,16 @@
"title": "GNU General Public License 3.0"
}
],
"homepage": "https://github.com/cbdq-io/datasets/tree/main/uk/gov/metoffice/historic_station_data",
"sources": [
{
"title": "Met Office Historical Station Data",
"path": "https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data"
}
],
"contributers": [
{
"title": "Cloud Based DQ Ltd.",
"path": "https://www.cbdq.io",
"role": "wrangler"
}
],
"keywords": [
"climate",
"MetOffice",
"change",
"historic"
],
"image": "https://upload.wikimedia.org/wikipedia/en/thumb/f/f4/Met_Office.svg/150px-Met_Office.svg.png",
"name": "io.cbdq.metoffice.historic_station_data",
"profile": "tabular-data-package",
"resources": [
{
"profile": "tabular-data-resource",
"name": "historic_station_data",
"path": "data/historic-station-data.csv",
"profile": "tabular-data-resource",
"schema": {
"missingValues": [""],
"fields": [
{
"name": "station_name",
"type": "string",
"description": "The name of the station that the data refers to.",
{
"constraints": {
"enum": [
"aberporth",
@@ -88,99 +78,137 @@
"wickairport",
"yeovilton"
]
}
},
"description": "The name of the station that the data refers to.",
"name": "station_name",
"type": "string"
},
{
"description": "Free text associated with the station. Will be missing if the row is for a data record.",
"name": "metadata",
"type": "string",
"description": "Free text associated with the station. Will be missing if the row is for a data record."
"type": "string"
},
{
"name": "month",
"type": "yearmonth",
"description": "The ISO 8601 month. Will be missing if the row is a metadata record.",
"example": "2024-04"
"example": "2024-04",
"name": "month",
"type": "yearmonth"
},
{
"description": "Mean daily maximum temperature (centigrade).",
"name": "tmax",
"type": "number",
"description": "Mean daily maximum temperature (centigrade)."
"type": "number"
},
{
"description": "True if tmax is estimated.",
"falseValues": [
"False"
],
"name": "tmax_is_estimated",
"type": "boolean",
"trueValues": ["True"],
"falseValues": ["False"],
"description": "True if tmax is estimated."
"trueValues": [
"True"
],
"type": "boolean"
},
{
"description": "Mean daily minimum temperature (centigrade).",
"name": "tmin",
"type": "number",
"description": "Mean daily minimum temperature (centigrade)."
"type": "number"
},
{
"description": "True if tmin is estimated.",
"falseValues": [
"False"
],
"name": "tmin_is_estimated",
"type": "boolean",
"trueValues": ["True"],
"falseValues": ["False"],
"description": "True if tmin is estimated."
"trueValues": [
"True"
],
"type": "boolean"
},
{
"description": "Days of air frost.",
"name": "af",
"type": "integer",
"description": "Days of air frost."
"type": "integer"
},
{
"description": "True if af is estimated.",
"falseValues": [
"False"
],
"name": "af_is_estimated",
"type": "boolean",
"trueValues": ["True"],
"falseValues": ["False"],
"description":"True if af is estimated."
"trueValues": [
"True"
],
"type": "boolean"
},
{
"description": "Total rainfall (mm).",
"name": "rain",
"type": "number",
"description": "Total rainfall (mm)."
"type": "number"
},
{
"description": "True if rain is estimated.",
"falseValues": [
"False"
],
"name": "rain_is_estimated",
"type": "boolean",
"trueValues": ["True"],
"falseValues": ["False"],
"description": "True if rain is estimated."
"trueValues": [
"True"
],
"type": "boolean"
},
{
"description": "Total sunshine duration.",
"name": "sun",
"type": "number",
"description": "Total sunshine duration."
"type": "number"
},
{
"description": "True if sun is estimated.",
"falseValues": [
"False"
],
"name": "sun_is_estimated",
"type": "boolean",
"trueValues": ["True"],
"falseValues": ["False"],
"description": "True if sun is estimated."
"trueValues": [
"True"
],
"type": "boolean"
},
{
"name": "sun_instrument",
"type": "string",
"description": "The sun instrument installed at the station.",
"constraints": {
"enum": [
"Campbell Stokes recorder",
"Kipp & Zonen sensor"
]
}
},
"description": "The sun instrument installed at the station.",
"name": "sun_instrument",
"type": "string"
},
{
"description": "Data are indicated as provisional until the full network quality control has been carried out. After this, data are final.",
"falseValues": [
"False"
],
"name": "provisional",
"type": "boolean",
"trueValues": ["True"],
"falseValues": ["False"],
"description": "Data are indicated as provisional until the full network quality control has been carried out. After this, data are final."
"trueValues": [
"True"
],
"type": "boolean"
}
],
"missingValues": [
""
]
}
}
]
}
],
"sources": [
{
"path": "https://www.metoffice.gov.uk/research/climate/maps-and-data/historic-station-data",
"title": "Met Office Historical Station Data"
}
],
"title": "Met Office Historical Station Data",
"version": "1.0.0"
}
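
The datapackage.json rewritten by the ETL (keys sorted alphabetically) now carries two new top-level fields, created and version, produced by the etl.py change shown below. As an illustration only, not part of this commit, the fields can be read back with the Python standard library; the path assumes the repository root as the working directory:

# Illustration only (not part of this commit): read the new top-level fields.
import json

with open('uk/gov/metoffice/historic_station_data/datapackage.json') as stream:
    data_package = json.load(stream)

print(data_package['version'])  # "1.0.0", taken from the GIT_TAG build variable
print(data_package['created'])  # ISO 8601 UTC timestamp, e.g. "2024-06-08T19:12:11+00:00"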
17 changes: 17 additions & 0 deletions uk/gov/metoffice/historic_station_data/scripts/etl.py
@@ -13,7 +13,9 @@
see <http://www.gnu.org/licenses/>.
"""
import argparse
import datetime
import itertools
import json
import logging
import os
import re
@@ -499,6 +501,7 @@ def __init__(self, df: pd.DataFrame) -> None:
self.df = df
self.load_archives()
self.load_data()
self.load_package_data()

def load_archive(self, station_name: str) -> None:
"""
@@ -531,6 +534,20 @@ def load_data(self) -> None:
logger.info(f'Loading transformed data to "{data_filename}".')
self.df.to_csv(data_filename, index=False)

def load_package_data(self) -> None:
"""Update tha datapackage.json file."""
data_package_file_name = f'{BASE_DIRECTORY}/datapackage.json'

with open(data_package_file_name) as stream:
data_package = json.load(stream)

data_package['version'] = os.environ['GIT_TAG']
now = datetime.datetime.now(datetime.UTC)
data_package['created'] = now.isoformat(timespec='seconds')

with open(data_package_file_name, 'w') as stream:
json.dump(data_package, stream, indent=4, sort_keys=True)

def qualify_metadata_duplication(self, row: pd.Series) -> bool:
"""
Qualify if the metadata is duplicated or empty.
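
The new load_package_data step is called from __init__ alongside load_archives and load_data: it stamps version from the GIT_TAG environment variable and created with the current UTC time, then rewrites datapackage.json with indent=4 and sort_keys=True, consistent with the alphabetically reordered datapackage.json diff above. Note that datetime.UTC requires Python 3.11 or later, and os.environ['GIT_TAG'] raises KeyError if the variable is unset. Below is a hypothetical invocation sketch, not taken from this repository's tooling (the CI pipeline calls make build instead):

# Hypothetical sketch (not part of the commit): run the ETL script directly,
# supplying the version to stamp into datapackage.json via GIT_TAG.
import os
import subprocess

env = dict(os.environ, GIT_TAG='1.0.0')  # same value the Makefile sets for GIT_TAG
subprocess.run(
    ['./uk/gov/metoffice/historic_station_data/scripts/etl.py'],
    check=True,  # fail loudly if the ETL exits non-zero
    env=env,
)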
