diff --git a/docs/io.md b/docs/io.md
index b3b5327588233a74721592491c965d2c569247f6..071ffe4c54452fa2a0892d5f969147881154cfe5 100644
--- a/docs/io.md
+++ b/docs/io.md
@@ -70,8 +70,8 @@ In the workflow executor, sometimes a computed parameter value allocates too muc
 If you wish to change these settings, you can create a custom datastore configuration file with these contents:
 
 ```yaml
-inline-threshold: 100000 # threshold for offloading data, in bytes
 type: file # can be 'gridfs' for database file object storage
+inline-threshold: 100000 # threshold for offloading data, in bytes
 path: /path/to/local/workspace # directory used if type is 'file'
 name: vre_language_datastore # collection name used if type is 'gridfs'
 launchpad: /path/to/launchpad.yaml # path to custom launchpad file used if type is 'gridfs'
@@ -79,7 +79,8 @@ format: json # 'yaml' and 'hdf5' not implemented
 compress: true # use compression
 ```
 
-The type `file` triggers storage in local files in `path`. The default `path` is `$HOME/.fireworks/vre-language-datastore`. The `path` will be created automatically if it does not exist. The default `launchpad` is `LAUNCHPAD_LOC` as provided by FireWorks. All other default settings are shown in the example above.
+The `type: file` setting triggers storage in local files under `path`. Setting `type: null` deactivates offloading regardless of the other settings.
+The default `path` is `$HOME/.fireworks/vre-language-datastore`. The `path` will be created automatically if it does not exist. The default `launchpad` is `LAUNCHPAD_LOC` as provided by FireWorks. All other default settings are shown in the example above.
 
 The default path of the datastore configuration file is `$HOME/.fireworks/datastore_config.yaml`. It will be loaded automatically if the file exists.
 If your datastore configuration has a different location, then you must set the environment variable
diff --git a/src/virtmat/language/utilities/serializable.py b/src/virtmat/language/utilities/serializable.py
index 291ccda718f959300e3521abe4b941d2d0f67b84..28051cfa35f9ea9015e68b7ca4284c01a387a196 100644
--- a/src/virtmat/language/utilities/serializable.py
+++ b/src/virtmat/language/utilities/serializable.py
@@ -3,6 +3,7 @@ import typing
 from dataclasses import dataclass
 from json import JSONEncoder
 from itertools import islice
+from functools import cached_property
 import numpy
 import pandas
 import pint_pandas
@@ -31,7 +32,7 @@ def versioned_deserialize(func):
         version = dct.pop('_version', None)
         if version == DATA_SCHEMA_VERSION:
             return func(cls, dct)  # current version
-        if version is None:  # non-tagged is implicitly version 6, to be depricated
+        if version is None:  # non-tagged is implicitly version 6, to be deprecated
             return func(cls, dct)
         return getattr(cls, f'from_dict_{version}')(cls, dct)
     return decorator
@@ -61,54 +62,70 @@ def get_json_size(obj, max_size):
 @dataclass
 class FWDataObject(FWSerializable):
     """top-level FWSerializable dataclass to hold any FWSerializable objects"""
-    value: typing.Any = None
+    __value: typing.Any = None
     datastore: dict = None
     filename: str = None
     _fw_name = '{{' + __loader__.name + '.' + __qualname__ + '}}'
 
     @serialize_fw
-    @recursive_serialize
     @versioned_serialize
     def to_dict(self):
         f_name = f'{__name__}.{self.__class__.__name__}.to_dict()'
         logger = get_fw_logger(f_name)
         logger.debug('%s: starting', f_name)
         if self.datastore is None:
-            dct = recursive_dict(self.value)
-            b_thres = ioops.DATASTORE_CONFIG['inline-threshold']
-            b_size = get_json_size(dct, b_thres)
-            logger.debug('%s: data type: %s', f_name, type(self.value))
-            logger.debug('%s: data size [B]: %s', f_name, b_size)
-            logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres)
-            if b_size < b_thres:
-                self.datastore = {'type': None}
-                logger.info('%s: data not offloaded', f_name)
-                return {'value': dct, 'datastore': self.datastore}
-            logger.info('%s: inline data limit exceeded: %s', f_name, b_size)
-            self.datastore, self.filename = ioops.offload_data(dct)
-            if self.datastore['type'] is None:
-                logger.info('%s: data not offloaded', f_name)
-            else:
-                logger.info('%s: data offloaded in %s', f_name, self.filename)
-        if self.datastore['type'] is None:
-            logger.debug('%s: datastore: %s', f_name, self.datastore)
-            return {'value': self.value, 'datastore': self.datastore}
+            logger.debug('%s: data type: %s', f_name, type(self.__value))
+            self.__value = recursive_dict(self.__value)
+            if ioops.DATASTORE_CONFIG['type'] is not None:
+                b_thres = ioops.DATASTORE_CONFIG['inline-threshold']
+                b_size = get_json_size(self.__value, b_thres)
+                logger.debug('%s: data size [B]: %s', f_name, b_size)
+                logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres)
+                if b_size > b_thres:
+                    logger.info('%s: inline data limit exceeded: %s', f_name, b_size)
+                    self.datastore, self.filename = ioops.offload_data(self.__value)
+                    assert self.datastore['type'] is not None
+                    logger.info('%s: data offloaded in %s', f_name, self.filename)
+                    return {'datastore': self.datastore, 'filename': self.filename}
+            self.datastore = {'type': None}
+            logger.info('%s: data not offloaded', f_name)
+            return {'value': self.__value, 'datastore': self.datastore}
         logger.debug('%s: datastore: %s', f_name, self.datastore)
+        if self.datastore['type'] is None:
+            return {'value': self.__value, 'datastore': self.datastore}
         logger.debug('%s: data in file: %s', f_name, self.filename)
         return {'datastore': self.datastore, 'filename': self.filename}
 
     @classmethod
-    @recursive_deserialize
     @versioned_deserialize
     def from_dict(cls, m_dict):
-        if 'datastore' in m_dict and m_dict['datastore'] is not None:
-            if m_dict['datastore']['type'] is None:
-                return cls(m_dict['value'], m_dict['datastore'])
-            assert 'filename' in m_dict and m_dict['filename'] is not None
-            val = ioops.lade_data(m_dict['datastore'], m_dict['filename'])
-            dval = recursive_deserialize(lambda _, x: x)(None, {'v': val})['v']
-            return cls(dval, m_dict['datastore'], m_dict['filename'])
-        return cls(m_dict['value'])
+        assert 'datastore' in m_dict and m_dict['datastore'] is not None
+        if m_dict['datastore']['type'] is None:
+            return cls(m_dict['value'], m_dict['datastore'])
+        assert 'filename' in m_dict and m_dict['filename'] is not None
+        assert 'value' not in m_dict
+        return cls(None, m_dict['datastore'], m_dict['filename'])
+
+    @cached_property
+    def value(self):
+        """restore the value, loading it from the datastore if it was offloaded"""
+        assert self.datastore is not None
+        # a value created with from_obj() must not be restored, no known use case
+        # if self.datastore is None:  # from_obj() sets datastore to None
+        #     return self.__value
+
+        def restore_value(val):
+            @recursive_deserialize
+            def restore_from_dict(_, dct):
+                return dct
+            return restore_from_dict(None, {'v': val})['v']
+
+        if self.datastore['type'] is None:
+            return restore_value(self.__value)
+        assert self.filename is not None
+        assert self.__value is None
+        self.__value = ioops.lade_data(self.datastore, self.filename)
+        return restore_value(self.__value)
 
     @classmethod
     def from_obj(cls, obj):
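
Note: the new `value` accessor relies on `functools.cached_property`, so an offloaded payload is loaded from the datastore only on first access and memoized afterwards. A minimal, FireWorks-independent sketch of that pattern (the `Record` and `blob` names are illustrative, not part of the package):

```python
from dataclasses import dataclass
from functools import cached_property
import json


@dataclass
class Record:
    """stand-in for FWDataObject holding a serialized payload"""
    blob: str  # plays the role of the private __value field

    @cached_property
    def value(self):
        # executed once, on first attribute access; the result is then
        # stored on the instance, so later reads skip the restore step
        return json.loads(self.blob)


rec = Record('{"a": [1, 2, 3]}')
assert rec.value == {'a': [1, 2, 3]}  # first access deserializes
assert rec.value is rec.value         # later accesses return the cached object
```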
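The reordered `to_dict()` now makes the offloading decision only when a datastore is configured. A simplified sketch of that control flow, where `offload` is a hypothetical stand-in for `ioops.offload_data()` and the config values mirror the documented defaults; the real method additionally logs each step and uses `get_json_size()` with the threshold as a cap on the size computation:

```python
import json

DATASTORE_CONFIG = {'type': 'file', 'inline-threshold': 100000}


def serialize(dct, offload):
    """keep dct inline unless it exceeds the configured threshold"""
    if DATASTORE_CONFIG['type'] is not None:
        if len(json.dumps(dct)) > DATASTORE_CONFIG['inline-threshold']:
            datastore, filename = offload(dct)
            return {'datastore': datastore, 'filename': filename}
    # 'type': None tags the payload as stored inline, so deserialization
    # knows it does not have to contact any datastore
    return {'value': dct, 'datastore': {'type': None}}
```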
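For context, the intended round trip after this change. This is an untested sketch; it assumes `from_obj()` accepts any serializable Python object and that a datastore is configured:

```python
from virtmat.language.utilities.serializable import FWDataObject

obj = FWDataObject.from_obj(list(range(100000)))  # wrap a large value
dct = obj.to_dict()               # offloads if the JSON exceeds the threshold
restored = FWDataObject.from_dict(dct)  # nothing loaded from the datastore yet
print(restored.value[:5])         # first .value access restores the data
```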