From 929cc67a8fe04b5328861ac5e6ad7a3f7b0d1e56 Mon Sep 17 00:00:00 2001 From: Ivan Kondov <ivan.kondov@kit.edu> Date: Sun, 30 Mar 2025 08:45:10 +0200 Subject: [PATCH 1/4] create value property in order to defer de-serialization --- .../language/utilities/serializable.py | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/virtmat/language/utilities/serializable.py b/src/virtmat/language/utilities/serializable.py index 291ccda7..2544c1f7 100644 --- a/src/virtmat/language/utilities/serializable.py +++ b/src/virtmat/language/utilities/serializable.py @@ -3,6 +3,7 @@ import typing from dataclasses import dataclass from json import JSONEncoder from itertools import islice +from functools import cached_property import numpy import pandas import pint_pandas @@ -31,7 +32,7 @@ def versioned_deserialize(func): version = dct.pop('_version', None) if version == DATA_SCHEMA_VERSION: return func(cls, dct) # current version - if version is None: # non-tagged is implicitly version 6, to be depricated + if version is None: # non-tagged is implicitly version 6, to be deprecated return func(cls, dct) return getattr(cls, f'from_dict_{version}')(cls, dct) return decorator @@ -61,7 +62,7 @@ def get_json_size(obj, max_size): @dataclass class FWDataObject(FWSerializable): """top-level FWSerializable dataclass to hold any FWSerializable objects""" - value: typing.Any = None + __value: typing.Any = None datastore: dict = None filename: str = None _fw_name = '{{' + __loader__.name + '.' + __qualname__ + '}}' @@ -74,10 +75,10 @@ class FWDataObject(FWSerializable): logger = get_fw_logger(f_name) logger.debug('%s: starting', f_name) if self.datastore is None: - dct = recursive_dict(self.value) + dct = recursive_dict(self.__value) b_thres = ioops.DATASTORE_CONFIG['inline-threshold'] b_size = get_json_size(dct, b_thres) - logger.debug('%s: data type: %s', f_name, type(self.value)) + logger.debug('%s: data type: %s', f_name, type(self.__value)) logger.debug('%s: data size [B]: %s', f_name, b_size) logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres) if b_size < b_thres: @@ -87,28 +88,44 @@ class FWDataObject(FWSerializable): logger.info('%s: inline data limit exceeded: %s', f_name, b_size) self.datastore, self.filename = ioops.offload_data(dct) if self.datastore['type'] is None: + # not covered, skip whole block if ioops.DATASTORE_CONFIG['type'] is None logger.info('%s: data not offloaded', f_name) else: logger.info('%s: data offloaded in %s', f_name, self.filename) if self.datastore['type'] is None: logger.debug('%s: datastore: %s', f_name, self.datastore) - return {'value': self.value, 'datastore': self.datastore} + return {'value': self.__value, 'datastore': self.datastore} # fixme: avoid repeated serialize logger.debug('%s: datastore: %s', f_name, self.datastore) logger.debug('%s: data in file: %s', f_name, self.filename) return {'datastore': self.datastore, 'filename': self.filename} @classmethod - @recursive_deserialize @versioned_deserialize def from_dict(cls, m_dict): if 'datastore' in m_dict and m_dict['datastore'] is not None: if m_dict['datastore']['type'] is None: return cls(m_dict['value'], m_dict['datastore']) assert 'filename' in m_dict and m_dict['filename'] is not None - val = ioops.lade_data(m_dict['datastore'], m_dict['filename']) - dval = recursive_deserialize(lambda _, x: x)(None, {'v': val})['v'] - return cls(dval, m_dict['datastore'], m_dict['filename']) - return cls(m_dict['value']) + assert 'value' not in m_dict + return cls(None, m_dict['datastore'], m_dict['filename']) + return cls(m_dict['value']) # not covered + + @cached_property + def value(self): + """deserialize the value""" + def restore_value(val): + @recursive_deserialize + def restore_from_dict(_, dct): + return dct + return restore_from_dict(None, {'v': val})['v'] + + if self.datastore is not None: + if self.datastore['type'] is None: + return restore_value(self.__value) + assert self.filename is not None + assert self.__value is None + self.__value = ioops.lade_data(self.datastore, self.filename) + return restore_value(self.__value) # fixme: do not restore here but at end of if block @classmethod def from_obj(cls, obj): -- GitLab From 607be1200e13d7f315e8aa8a5613bfd26764a3d0 Mon Sep 17 00:00:00 2001 From: Ivan Kondov <ivan.kondov@kit.edu> Date: Sun, 30 Mar 2025 08:47:02 +0200 Subject: [PATCH 2/4] include a sentence about the type: null option in the docs --- docs/io.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/io.md b/docs/io.md index b3b53275..071ffe4c 100644 --- a/docs/io.md +++ b/docs/io.md @@ -70,8 +70,8 @@ In the workflow executor, sometimes a computed parameter value allocates too muc If you wish to change these settings you can create a custom datastore configuration file with these contents: ```yaml -inline-threshold: 100000 # threshold for offloading data, in bytes type: file # can be 'gridfs' for database file object storage +inline-threshold: 100000 # threshold for offloading data, in bytes path: /path/to/local/workspace # directory used if type is 'file' name: vre_language_datastore # collection name used if type is 'gridfs' launchpad: /path/to/launchpad.yaml # path to custom launchpad file used if type is 'gridfs' @@ -79,7 +79,8 @@ format: json # 'yaml' and 'hdf5' not implemented compress: true # use compression ``` -The type `file` triggers storage in local files in `path`. The default `path` is `$HOME/.fireworks/vre-language-datastore`. The `path` will be created automatically if it does not exist. The default `launchpad` is `LAUNCHPAD_LOC` as provided by FireWorks. All other default settings are shown in the example above. +The `type: file` setting triggers storage in local files in `path`. Setting `type: null` deactivates the mechanism regardless of the other settings. +The default `path` is `$HOME/.fireworks/vre-language-datastore`. The `path` will be created automatically if it does not exist. The default `launchpad` is `LAUNCHPAD_LOC` as provided by FireWorks. All other default settings are shown in the example above. The default path of the datastore configuration file is `$HOME/.fireworks/datastore_config.yaml`. It will be automatically loaded, if the file exists. If your datastore configuration has a different location then you must set the environment variable -- GitLab From 416e2dc59210dc2eb313fb45daa11bd2e20c97ce Mon Sep 17 00:00:00 2001 From: Ivan Kondov <ivan.kondov@kit.edu> Date: Sun, 30 Mar 2025 11:25:37 +0200 Subject: [PATCH 3/4] explicitly assert datastore is not None, versions 6 and 7 compatible --- .../language/utilities/serializable.py | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/virtmat/language/utilities/serializable.py b/src/virtmat/language/utilities/serializable.py index 2544c1f7..7594c053 100644 --- a/src/virtmat/language/utilities/serializable.py +++ b/src/virtmat/language/utilities/serializable.py @@ -102,30 +102,32 @@ class FWDataObject(FWSerializable): @classmethod @versioned_deserialize def from_dict(cls, m_dict): - if 'datastore' in m_dict and m_dict['datastore'] is not None: - if m_dict['datastore']['type'] is None: - return cls(m_dict['value'], m_dict['datastore']) - assert 'filename' in m_dict and m_dict['filename'] is not None - assert 'value' not in m_dict - return cls(None, m_dict['datastore'], m_dict['filename']) - return cls(m_dict['value']) # not covered + assert 'datastore' in m_dict and m_dict['datastore'] is not None + if m_dict['datastore']['type'] is None: + return cls(m_dict['value'], m_dict['datastore']) + assert 'filename' in m_dict and m_dict['filename'] is not None + assert 'value' not in m_dict + return cls(None, m_dict['datastore'], m_dict['filename']) @cached_property def value(self): - """deserialize the value""" + """restore the value if datastore is defined, otherwise just return""" + assert self.datastore is not None + # if self.datastore is None: + # return self.__value # if created with from_obj, no known use case + def restore_value(val): @recursive_deserialize def restore_from_dict(_, dct): return dct return restore_from_dict(None, {'v': val})['v'] - if self.datastore is not None: - if self.datastore['type'] is None: - return restore_value(self.__value) - assert self.filename is not None - assert self.__value is None - self.__value = ioops.lade_data(self.datastore, self.filename) - return restore_value(self.__value) # fixme: do not restore here but at end of if block + if self.datastore['type'] is None: + return restore_value(self.__value) + assert self.filename is not None + assert self.__value is None + self.__value = ioops.lade_data(self.datastore, self.filename) + return restore_value(self.__value) @classmethod def from_obj(cls, obj): -- GitLab From 754ccf9f082991f84aac5835769d0b30ee0d723c Mon Sep 17 00:00:00 2001 From: Ivan Kondov <ivan.kondov@kit.edu> Date: Sun, 30 Mar 2025 20:01:57 +0200 Subject: [PATCH 4/4] avoid unnecessary serialization calls, do some refactoring to_dict() --- .../language/utilities/serializable.py | 42 +++++++++---------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/virtmat/language/utilities/serializable.py b/src/virtmat/language/utilities/serializable.py index 7594c053..28051cfa 100644 --- a/src/virtmat/language/utilities/serializable.py +++ b/src/virtmat/language/utilities/serializable.py @@ -68,34 +68,31 @@ class FWDataObject(FWSerializable): _fw_name = '{{' + __loader__.name + '.' + __qualname__ + '}}' @serialize_fw - @recursive_serialize @versioned_serialize def to_dict(self): f_name = f'{__name__}.{self.__class__.__name__}.to_dict()' logger = get_fw_logger(f_name) logger.debug('%s: starting', f_name) if self.datastore is None: - dct = recursive_dict(self.__value) - b_thres = ioops.DATASTORE_CONFIG['inline-threshold'] - b_size = get_json_size(dct, b_thres) logger.debug('%s: data type: %s', f_name, type(self.__value)) - logger.debug('%s: data size [B]: %s', f_name, b_size) - logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres) - if b_size < b_thres: - self.datastore = {'type': None} - logger.info('%s: data not offloaded', f_name) - return {'value': dct, 'datastore': self.datastore} - logger.info('%s: inline data limit exceeded: %s', f_name, b_size) - self.datastore, self.filename = ioops.offload_data(dct) - if self.datastore['type'] is None: - # not covered, skip whole block if ioops.DATASTORE_CONFIG['type'] is None - logger.info('%s: data not offloaded', f_name) - else: - logger.info('%s: data offloaded in %s', f_name, self.filename) - if self.datastore['type'] is None: - logger.debug('%s: datastore: %s', f_name, self.datastore) - return {'value': self.__value, 'datastore': self.datastore} # fixme: avoid repeated serialize + self.__value = recursive_dict(self.__value) + if ioops.DATASTORE_CONFIG['type'] is not None: + b_thres = ioops.DATASTORE_CONFIG['inline-threshold'] + b_size = get_json_size(self.__value, b_thres) + logger.debug('%s: data size [B]: %s', f_name, b_size) + logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres) + if b_size > b_thres: + logger.info('%s: inline data limit exceeded: %s', f_name, b_size) + self.datastore, self.filename = ioops.offload_data(self.__value) + assert self.datastore['type'] is not None + logger.info('%s: data offloaded in %s', f_name, self.filename) + return {'datastore': self.datastore, 'filename': self.filename} + self.datastore = {'type': None} + logger.info('%s: data not offloaded', f_name) + return {'value': self.__value, 'datastore': self.datastore} logger.debug('%s: datastore: %s', f_name, self.datastore) + if self.datastore['type'] is None: + return {'value': self.__value, 'datastore': self.datastore} logger.debug('%s: data in file: %s', f_name, self.filename) return {'datastore': self.datastore, 'filename': self.filename} @@ -113,8 +110,9 @@ class FWDataObject(FWSerializable): def value(self): """restore the value if datastore is defined, otherwise just return""" assert self.datastore is not None - # if self.datastore is None: - # return self.__value # if created with from_obj, no known use case + # value created with from_obj must not be restored, no known use case + # if self.datastore is None: # from_obj() sets datastore to None + # return self.__value def restore_value(val): @recursive_deserialize -- GitLab