Skip to content
Snippets Groups Projects
Commit eb7e3885 authored by Mohamed Anis Koubaa's avatar Mohamed Anis Koubaa :speech_balloon:
Browse files

construct target metadata using pydantic model.

parent 0fa0e08c
No related branches found
No related tags found
No related merge requests found
Showing
with 4448 additions and 202 deletions
...@@ -40,7 +40,7 @@ class ExcelSheetAdapter: ...@@ -40,7 +40,7 @@ class ExcelSheetAdapter:
except FileNotFoundError as error: except FileNotFoundError as error:
raise SourceNotFound(error) raise SourceNotFound(error)
sheet_names = self.excel_file.sheet_names self.sheet_names = sheet_names = self.excel_file.sheet_names
if data_sheet_name is not None: if data_sheet_name is not None:
if data_sheet_name not in sheet_names: if data_sheet_name not in sheet_names:
raise SourceNotFound() raise SourceNotFound()
...@@ -62,7 +62,7 @@ class ExcelSheetAdapter: ...@@ -62,7 +62,7 @@ class ExcelSheetAdapter:
print(f"This is df-serialization: \n{self.df}") print(f"This is df-serialization: \n{self.df}")
print(f"This is dfm-serialization: \n{self.dfm}") print(f"This is dfm-serialization: \n{self.dfm}")
def get_header(self) -> object: def get_header(self):
""" """
Read column names from excel file. Read column names from excel file.
Compose the header in a compatible way to oep. Compose the header in a compatible way to oep.
...@@ -78,7 +78,7 @@ class ExcelSheetAdapter: ...@@ -78,7 +78,7 @@ class ExcelSheetAdapter:
table_schema_output["columns"].append(json.loads(json_col)) table_schema_output["columns"].append(json.loads(json_col))
return dict(table_schema_output) return dict(table_schema_output)
def get_data(self) -> object: def get_data(self):
""" """
Export data from dataframe to a json object. Export data from dataframe to a json object.
:return: :return:
...@@ -86,10 +86,25 @@ class ExcelSheetAdapter: ...@@ -86,10 +86,25 @@ class ExcelSheetAdapter:
json_data = self.df.to_json(orient='records', indent=4, date_format='iso') json_data = self.df.to_json(orient='records', indent=4, date_format='iso')
return json.loads(json_data) return json.loads(json_data)
def get_metadata(self) -> object: def get_metadata_header(self):
""" """
Export metadata from dataframe to a json object. Export metadata header from dataframe to a json object.
:return: :return:
""" """
json_metadata = self.dfm.to_json(orient='table') json_metadata = self.dfm.to_json(orient='records', indent=4, date_format='iso')
return json.loads(json_metadata)
def get_metadata_section(self, section_name: str):
    """
    Load one metadata sheet from the workbook and return it as parsed JSON.

    :param section_name: name of the metadata sheet to read.
    :raises SourceNotFound: if the workbook has no sheet of that name.
    :return: JSON object (dict) produced from the sheet's dataframe.
    """
    # Guard clause: fail fast on an unknown sheet name.
    if section_name not in self.sheet_names:
        raise SourceNotFound("there is no sheet with the name: " + section_name)
    # NOTE(review): re-reads the workbook from self.link on every call —
    # confirm __init__ sets self.link (constructor args are table_path/table_url).
    section_frame = pd.read_excel(self.link, engine='openpyxl',
                                  sheet_name=section_name, index_col=0)
    serialized = section_frame.to_json(indent=4, date_format='iso')
    return json.loads(serialized)
from pydantic import BaseModel
class OEMeta(BaseModel):
    """Placeholder pydantic model for an OEMetadata document; no fields defined yet."""
    pass
# generated by datamodel-codegen: # generated by data-model-codegen:
# filename: fields.json # filename: fields.json
# timestamp: 2025-01-27T15:02:42+00:00 # timestamp: 2025-01-27T15:02:42+00:00
...@@ -16,7 +16,7 @@ class IsAboutItem(BaseModel): ...@@ -16,7 +16,7 @@ class IsAboutItem(BaseModel):
examples=['wind energy converting unit'], examples=['wind energy converting unit'],
title='Is About Name', title='Is About Name',
) )
field_id: Optional[AnyUrl] = Field( field_id: Optional[AnyUrl | None | str] = Field(
None, None,
alias='@id', alias='@id',
description='The path of the ontology term (IRI).', description='The path of the ontology term (IRI).',
...@@ -50,7 +50,8 @@ class ValueReferenceItem(BaseModel): ...@@ -50,7 +50,8 @@ class ValueReferenceItem(BaseModel):
class FieldModel(BaseModel): class FieldModel(BaseModel):
name: Optional[str] = Field( name: Optional[str] = Field(
..., ...,
description='The name of the field. The name may only consist of lowercase alphanumeric characters or underscores. It must not begin with a number or an underscore.', description='The name of the field. The name may only consist of lowercase alphanumeric characters or '
'underscores. It must not begin with a number or an underscore.',
examples=['year'], examples=['year'],
title='Column Name', title='Column Name',
) )
......
...@@ -34,7 +34,7 @@ class Contributor(BaseModel): ...@@ -34,7 +34,7 @@ class Contributor(BaseModel):
description='An array describing the roles of the contributor.', description='An array describing the roles of the contributor.',
title='Roles', title='Roles',
) )
date: Optional[date] = Field( date_: Optional[date] = Field(
None, None,
description='The date of the final contribution. Date Format is ISO 8601.', description='The date of the final contribution. Date Format is ISO 8601.',
examples=['2024-10-21'], examples=['2024-10-21'],
......
{ {
"name": "oep_oemetadata", "name": "oep_oemetadata",
"title": "OEP OEMetadata", "title": "OEP OEMetadata",
"description": "A collection of tables for the OEMetadata examples.", "description": "A dataset for the OEMetadata examples.",
"id": "https://databus.openenergyplatform.org/oeplatform/reference", "id": "https://databus.openenergyplatform.org/oeplatform/reference",
"resources": [ "resources": [
{ {
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
"subject": [ "subject": [
{ {
"name": "energy", "name": "energy",
"path": "https://openenergyplatform.org/ontology/oeo/OEO_00000150" "@id": "https://openenergyplatform.org/ontology/oeo/OEO_00000150"
} }
], ],
"keywords": [ "keywords": [
...@@ -85,14 +85,14 @@ ...@@ -85,14 +85,14 @@
{ {
"title": "IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report", "title": "IPCC Sixth Assessment Report (AR6) - Climate Change 2023 - Synthesis Report",
"authors": [ "authors": [
"Hoesung Lee", "Lang Lee",
"José Romero", "José Romero",
"The Core Writing Team" "The Core Writing Team"
], ],
"description": "A Report of the Intergovernmental Panel on Climate Change.", "description": "A Report of the Intergovernmental Panel on Climate Change.",
"publicationYear": "2023", "publicationYear": "2023",
"path": "https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf", "@id": "https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_FullVolume.pdf",
"licenses": [ "sourceLicenses": [
{ {
"name": "ODbL-1.0", "name": "ODbL-1.0",
"title": "Open Data Commons Open Database License 1.0", "title": "Open Data Commons Open Database License 1.0",
...@@ -124,7 +124,7 @@ ...@@ -124,7 +124,7 @@
], ],
"date": "2024-10-21", "date": "2024-10-21",
"object": "data and metadata", "object": "data and metadata",
"comment": "Add general context." "comment": "Add metadata example."
} }
], ],
"type": "table", "type": "table",
...@@ -141,14 +141,14 @@ ...@@ -141,14 +141,14 @@
"isAbout": [ "isAbout": [
{ {
"name": "wind energy converting unit", "name": "wind energy converting unit",
"path": "https://openenergyplatform.org/ontology/oeo/OEO_00000044" "@id": "https://openenergyplatform.org/ontology/oeo/OEO_00000044"
} }
], ],
"valueReference": [ "valueReference": [
{ {
"value": "onshore", "value": "onshore",
"name": "onshore wind farm", "name": "onshore wind farm",
"path": "https://openenergyplatform.org/ontology/oeo/OEO_00000311" "@id": "https://openenergyplatform.org/ontology/oeo/OEO_00000311"
} }
] ]
} }
......
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.linked_data import Model as LinkedDataModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.general import Model as GeneralModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.context import Model as ContextModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.spatial import Model as SpatialModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.temporal import Model as TemporalModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.sources import Model as SourceModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.licenses import Model as LicenseModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.provenance import Model as ProvenanceModel
from backend_regimo.components.oep_access.oemeta.latest.build_source.schemas.fields import (Model as FieldsModel,
FieldModel,
IsAboutItem,
ValueReferenceItem,
Schema)
import json
from pydantic.networks import AnyUrl
from pydantic import ValidationError
class OEMeta:
    """
    Builder for a target OEMetadata document.

    Holds one generated pydantic model instance per metadata section
    (general, context, spatial, temporal, sources, licenses, provenance,
    resource/fields), merges values taken from an Excel metadata source
    into them, and serializes the combined document to a JSON file.
    """

    def __init__(self, model_path, schema_path):
        """
        :param model_path: path to the generated pydantic model source.
        :param schema_path: path to the resolved JSON schema file.
        """
        super().__init__()
        self.model_path = model_path
        self.schema_path = schema_path
        # Accumulator for the per-section dumps written by get_metadata().
        self.metadata = {}
        self.linkedData = LinkedDataModel()
        # GeneralModel requires a name; placeholder until merged from the source.
        self.general = GeneralModel(name='general_name')
        self.context = ContextModel()
        self.spatial = SpatialModel()
        self.temporal = TemporalModel()
        self.sources = SourceModel()
        self.licenses = LicenseModel()
        self.provenance = ProvenanceModel()
        self.resource = FieldsModel()
        # 'schema_' is the generated alias for the JSON key 'schema'.
        self.resource.schema_ = Schema(primaryKey=[])
        self.resource.schema_.fields = []

    def merge_field_from_meta_source(self, meta_source):
        """
        append a new field to the target meta.

        :param meta_source: mapping with flattened field-metadata keys as
            produced by the metadata header sheet, e.g. "name", "type",
            "unit", "isAbout.name", "isAbout.path", "valueReference.value",
            "valueReference.name", "valueReference.path".
        """
        is_about_url = meta_source["isAbout.path"]
        value_reference_url = meta_source["valueReference.path"]
        # The first iteration is marked by the length of the fields array;
        # resource-level attributes are set only once.
        if len(self.resource.schema_.fields) == 0:
            self.resource.type = 'table'
            self.resource.format = 'xlsx'
            self.resource.encoding = 'utf-8'
        current_is_about = IsAboutItem(name=meta_source["isAbout.name"])
        current_is_about.field_id = is_about_url  # AnyUrl(url=is_about_url)
        current_value_reference = ValueReferenceItem(name=meta_source["valueReference.name"])
        current_value_reference.value = meta_source["valueReference.value"]
        current_value_reference.field_id = value_reference_url  # AnyUrl(url=value_reference_url)
        current_field = FieldModel(name=meta_source["name"], type=meta_source["type"], nullable=True)
        current_field.isAbout = [current_is_about]
        # NOTE(review): isAbout is wrapped in a list but valueReference is
        # assigned as a single item — confirm against the generated
        # FieldModel whether valueReference should also be a list.
        current_field.valueReference = current_value_reference
        current_field.unit = meta_source["unit"]
        self.resource.schema_.fields.append(current_field)

    def merge_general_from_meta_source(self, general_metadata):
        """
        Copy name/description/publicationDate from the 'general' metadata
        sheet into the general section model.

        :param general_metadata: nested mapping; only the "first_or_single"
            row is consumed here.
        """
        name = general_metadata["first_or_single"]["name"]
        description = general_metadata["first_or_single"]["description"]
        publication_date = general_metadata["first_or_single"]["publicationDate"]
        self.general.name = name
        self.general.description = description
        self.general.publicationDate = publication_date

    def merge_licenses_from_meta_source(self, licenses_metadata):
        # TODO: not yet implemented — licenses section is left at its default.
        pass

    def merge_provenance_from_meta_source(self, provenance_metadata):
        # TODO: not yet implemented — provenance section is left at its default.
        pass

    def merge_spatial_from_meta_source(self, spatial_metadata):
        # TODO: not yet implemented — spatial section is left at its default.
        pass

    def merge_temporal_from_meta_source(self, temporal_metadata):
        # TODO: not yet implemented — temporal section is left at its default.
        pass

    def get_metadata(self, metadata_target_path: str = "table_as_current_meta.json") -> None:
        """
        Dump every section model into self.metadata and write the combined
        document as indented JSON to metadata_target_path.

        Note: despite the name, this returns nothing — its result is the
        written file and the populated self.metadata dict.
        """
        # self.metadata["linkedData"] = self.linkedData.model_dump()
        self.metadata["general"] = self.general.model_dump()
        self.metadata["context"] = self.context.model_dump()
        self.metadata["spatial"] = self.spatial.model_dump()
        self.metadata["temporal"] = self.temporal.model_dump()
        self.metadata["sources"] = self.sources.model_dump()
        self.metadata["licenses"] = self.licenses.model_dump()
        self.metadata["provenance"] = self.provenance.model_dump()
        self.metadata["resource"] = self.resource.model_dump()
        with open(metadata_target_path, "w") as f:
            f.write(json.dumps(self.metadata, indent=4))
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
from pathlib import Path from pathlib import Path
from random import randint
LOG_FORMAT = "[%(asctime)s %(module)16s %(levelname)7s] %(message)s" LOG_FORMAT = "[%(asctime)s %(module)16s %(levelname)7s] %(message)s"
...@@ -22,3 +24,6 @@ EXAMPLE_PATH = VERSION_PATH / "example.json" ...@@ -22,3 +24,6 @@ EXAMPLE_PATH = VERSION_PATH / "example.json"
PATH_TO_SOURCE = ('/Users/ot2661/Documents/01_dev/aed_pub/regimo/regimo/services/backend_regimo/data' + PATH_TO_SOURCE = ('/Users/ot2661/Documents/01_dev/aed_pub/regimo/regimo/services/backend_regimo/data' +
'/LLEC_Data/dataset_sample_2rows.xlsx') '/LLEC_Data/dataset_sample_2rows.xlsx')
URL_TO_SOURCE = "https://github.com/koubaa-hmc/LLEC_Data/raw/refs/heads/main/dataset_sample_2rows.xlsx" URL_TO_SOURCE = "https://github.com/koubaa-hmc/LLEC_Data/raw/refs/heads/main/dataset_sample_2rows.xlsx"
TOPIC = "sandbox"
TARGET_TABLE_NAME = f"living_lab_table_{randint(0, 100000)}"
This diff is collapsed.
...@@ -5,66 +5,131 @@ import logging ...@@ -5,66 +5,131 @@ import logging
from pathlib import Path from pathlib import Path
from getpass import getpass from getpass import getpass
from os import environ from os import environ
from random import randint
from oep_client import OepClient from oep_client import OepClient
from oemeta.target_meta import OEMeta from oemeta.target_meta import OEMeta
from excel_sheet_adapter import ExcelSheetAdapter from excel_sheet_adapter import ExcelSheetAdapter
from settings import (RESOLVED_SCHEMA_FILE_NAME, TEMPLATE_PATH, LOG_FORMAT, from settings import (MODEL_PATH, LOG_FORMAT, PATH_TO_SOURCE,
PATH_TO_SOURCE, URL_TO_SOURCE) URL_TO_SOURCE, TOPIC, TARGET_TABLE_NAME,
RESOLVED_SCHEMA_FILE_NAME)
# set default value for the oep token
environ.setdefault('OEP_API_TOKEN',
"f00fa56fa4554da3714832a9452525248f9c4988") def main():
"""
topic = "sandbox" An excel file containing both data and metadata is uploaded to oep.
table = f"living_lab_table_{randint(0, 100000)}" Metadata are attached to the data's table.
A review process will be triggered, as a result the data are published to
# configure logger. Please select your logging level the databus.open-energy-platform.org.
# INFO or DEBUG. Logs are to find under log/ """
logging.basicConfig(level=logging.DEBUG, filename='log/'+table+'.log',
format=LOG_FORMAT, filemode='w') # set default value for the oep token
logger = logging.getLogger(__name__) environ.setdefault('OEP_API_TOKEN',
logger.info('Starting LLEC upload') "f00fa56fa4554da3714832a9452525248f9c4988")
path_to_source = PATH_TO_SOURCE # set values from settings
url_to_source = URL_TO_SOURCE path_to_source = PATH_TO_SOURCE
token = environ.get("OEP_API_TOKEN") or getpass("Enter your OEP API token: ") url_to_source = URL_TO_SOURCE
model_path = MODEL_PATH
# for read/write, we need to add authorization header schema_path = RESOLVED_SCHEMA_FILE_NAME
auth_headers = {"Authorization": "Token %s" % token} table_name = TARGET_TABLE_NAME
table_api_url = f"https://openenergyplatform.org/api/v0/schema/{topic}/tables/{table}/" topic = TOPIC
logger.info("Table API URL: %s" % table_api_url) token = environ.get("OEP_API_TOKEN") or getpass("Enter your OEP API token: ")
# instantiate table source # INFO or DEBUG. Logs are to find under log/
table_source = ExcelSheetAdapter(table_path=Path(path_to_source), logging.basicConfig(level=logging.DEBUG, filename='log/' + table_name + '.log',
data_sheet_name='dataset_sample_2rows_comp', format=LOG_FORMAT, filemode='w')
metadata_sheet_name='dataset_sample_2rows_meta') logger = logging.getLogger(__name__)
# instantiate target meta logger.info('Starting LLEC upload')
target_meta = OEMeta()
# for read/write, we need to add authorization header # auth_headers = {"Authorization": "Token %s" % token}
# get header table_api_url = f"https://openenergyplatform.org/api/v0/schema/{topic}/tables/{table_name}/"
table_schema_definition = table_source.get_header() logger.info("Table API URL: %s" % table_api_url)
# load data from table source # instantiate table source
table_data = table_source.get_data() table_source = ExcelSheetAdapter(table_path=Path(path_to_source), table_url=url_to_source,
data_sheet_name='dataset_sample_2rows_comp',
# load metadata from table source metadata_sheet_name='dataset_sample_2rows_meta')
table_metadata = table_source.get_metadata() # - target metadata
# todo: handle metadata with in an object(oemeta) target_meta = OEMeta(model_path=model_path,
schema_path=schema_path)
# attach a model to target_meta # - client to oep
target_meta.attach_schema(MODEL_PATH) cli = OepClient(token=token,
default_schema=topic)
# instantiate client to oep
cli = OepClient(token=token, default_schema=topic) # get header from table source
cli.create_table(table, table_schema_definition) table_schema_definition = table_source.get_header()
cli.insert_into_table(table, table_data)
# load data from table source
# get metadata (from example file) table_data = table_source.get_data()
# metadata = req.get(
# "https://raw.githubusercontent.com/OpenEnergyPlatform/academy/production/docs/data/tutorial_example_table.metadata.json").json() # merge metadata from table source into target meta
##
# metadata = cli.set_metadata(table, metadata) # load metadata header from table source in <sheet[1]|metadata_sheet_name>
# print(json.dumps(metadata, indent=4)) table_metadata_header = table_source.get_metadata_header()
# example of a field in table_metadata_header
# {
# "index": 0,
# "name": "t_amb",
# "Description": "Ambient temperature",
# "type": "Number",
# "isAbout.name": "temperature",
# "isAbout.path": "http://openenergy-platform.org/ontology/oeo/OEO_00010453",
# "valueReference.value": "temperature",
# "valueReference.name": "temperature",
# "valueReference.path": "http://openenergy-platform.org/ontology/oeo/OEO_00010453",
# "unit": "[°C]"
# },
for field_item in table_metadata_header:
target_meta.merge_field_from_meta_source(field_item)
general_metadata = table_source.get_metadata_section("general")
# example of
# {'first_or_single': {'name': 'Living Lab Measurements', 'topics': 'Measurement', 'title': 'Living Lab Meas',
# 'path': 'https://github.com/koubaa-hmc/LLEC_Data/raw/refs/heads/main/dataset_2rows.xlsx',
# 'description': 'The table is a collection of measurements done in a Living Lab',
# 'languages': '"en-GB"', 'subject.name': 'energy use',
# 'subject.path': 'http://openenergy-platform.org/ontology/oeo/OEO_00010210',
# 'keywords': 'http://openenergy-platform.org/ontology/oeo/OEO_00000150',
# 'publicationDate': '2025-01-28T00:00:00.000', 'embargoPeriod.start': None,
# 'embargoPeriod.end': None, 'embargoPeriod.isActive': False},
# 'second': {'name': None, 'topics': 'Energy', 'title': None, 'path': None, 'description': None, 'languages': None,
# 'subject.name': None, 'subject.path': None,
# 'keywords': 'http://openenergy-platform.org/ontology/oeo/OEO_00000384', 'publicationDate': None,
# 'embargoPeriod.start': None, 'embargoPeriod.end': None, 'embargoPeriod.isActive': None},
# 'third': {'name': None, 'topics': 'Temperature', 'title': None, 'path': None, 'description': None,
# 'languages': None, 'subject.name': None, 'subject.path': None, 'keywords': None, 'pubDate': None,
# 'embargoPeriod.start': None, 'embargoPeriod.end': None, 'embargoPeriod.isActive': None},
# 'fourth': {'name': None, 'topics': None, 'title': None, 'path': None, 'description': None, 'languages': None,
# 'subject.name': None, 'subject.path': None, 'keywords': None, 'publicationDate': None,
# 'embargoPeriod.start': None, 'embargoPeriod.end': None, 'embargoPeriod.isActive': None}}
target_meta.merge_general_from_meta_source(general_metadata)
licenses_metadata = table_source.get_metadata_section("licenses")
target_meta.merge_licenses_from_meta_source(licenses_metadata)
provenance_metadata = table_source.get_metadata_section("provenance")
target_meta.merge_provenance_from_meta_source(provenance_metadata)
spatial_metadata = table_source.get_metadata_section("spatial")
target_meta.merge_spatial_from_meta_source(spatial_metadata)
temporal_metadata = table_source.get_metadata_section("temporal")
target_meta.merge_temporal_from_meta_source(temporal_metadata)
target_meta.get_metadata()
# create table on oep and upload data
cli.create_table(table_name, table_schema_definition)
cli.insert_into_table(table_name, table_data)
# get metadata (from example file)
# metadata = req.get(
# "https://raw.githubusercontent.com/OpenEnergyPlatform/academy/production/docs/data/tutorial_example_table.metadata.json").json()
# metadata = cli.set_metadata(table, metadata)
# print(json.dumps(metadata, indent=4))
if __name__ == '__main__':
main()
Subproject commit ed02372d79e68110b8a3a9d2ff977eac77a024f9 Subproject commit 22cf19d66416505f257a07d1221f3b8d7455c60e
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment