From 929cc67a8fe04b5328861ac5e6ad7a3f7b0d1e56 Mon Sep 17 00:00:00 2001
From: Ivan Kondov <ivan.kondov@kit.edu>
Date: Sun, 30 Mar 2025 08:45:10 +0200
Subject: [PATCH 1/4] create value property to defer deserialization

---
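Notes:

This patch moves loading and deserialization of offloaded data out of
from_dict() into the new cached property, so the work is deferred to the
first access of .value and then memoized. The field is renamed to __value,
which Python name-mangles to _FWDataObject__value inside the class body; the
dataclass-generated __init__ still takes it as the first positional argument,
which is how from_dict() constructs instances. A minimal sketch of the
deferral pattern (class and attribute names here are illustrative, not the
project's API):

```python
from functools import cached_property

class LazyRecord:
    """hold a raw payload and deserialize it only on first access"""
    def __init__(self, raw):
        self.raw = raw

    @cached_property
    def value(self):
        # runs exactly once; the result is cached in the instance __dict__
        print('deserializing')
        return {'payload': self.raw}

rec = LazyRecord('abc')
assert rec.value is rec.value  # the second access hits the cache
```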
 .../language/utilities/serializable.py        | 37 ++++++++++++++-----
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/src/virtmat/language/utilities/serializable.py b/src/virtmat/language/utilities/serializable.py
index 291ccda7..2544c1f7 100644
--- a/src/virtmat/language/utilities/serializable.py
+++ b/src/virtmat/language/utilities/serializable.py
@@ -3,6 +3,7 @@ import typing
 from dataclasses import dataclass
 from json import JSONEncoder
 from itertools import islice
+from functools import cached_property
 import numpy
 import pandas
 import pint_pandas
@@ -31,7 +32,7 @@ def versioned_deserialize(func):
         version = dct.pop('_version', None)
         if version == DATA_SCHEMA_VERSION:
             return func(cls, dct)  # current version
-        if version is None:  # non-tagged is implicitly version 6, to be depricated
+        if version is None:  # non-tagged is implicitly version 6, to be deprecated
             return func(cls, dct)
         return getattr(cls, f'from_dict_{version}')(cls, dct)
     return decorator
@@ -61,7 +62,7 @@ def get_json_size(obj, max_size):
 @dataclass
 class FWDataObject(FWSerializable):
     """top-level FWSerializable dataclass to hold any FWSerializable objects"""
-    value: typing.Any = None
+    __value: typing.Any = None
     datastore: dict = None
     filename: str = None
     _fw_name = '{{' + __loader__.name + '.' + __qualname__ + '}}'
@@ -74,10 +75,10 @@ class FWDataObject(FWSerializable):
         logger = get_fw_logger(f_name)
         logger.debug('%s: starting', f_name)
         if self.datastore is None:
-            dct = recursive_dict(self.value)
+            dct = recursive_dict(self.__value)
             b_thres = ioops.DATASTORE_CONFIG['inline-threshold']
             b_size = get_json_size(dct, b_thres)
-            logger.debug('%s: data type: %s', f_name, type(self.value))
+            logger.debug('%s: data type: %s', f_name, type(self.__value))
             logger.debug('%s: data size [B]: %s', f_name, b_size)
             logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres)
             if b_size < b_thres:
@@ -87,28 +88,44 @@ class FWDataObject(FWSerializable):
             logger.info('%s: inline data limit exceeded: %s', f_name, b_size)
             self.datastore, self.filename = ioops.offload_data(dct)
             if self.datastore['type'] is None:
+                # not covered: the whole block is skipped if ioops.DATASTORE_CONFIG['type'] is None
                 logger.info('%s: data not offloaded', f_name)
             else:
                 logger.info('%s: data offloaded in %s', f_name, self.filename)
         if self.datastore['type'] is None:
             logger.debug('%s: datastore: %s', f_name, self.datastore)
-            return {'value': self.value, 'datastore': self.datastore}
+            return {'value': self.__value, 'datastore': self.datastore}  # fixme: avoid repeated serialization
         logger.debug('%s: datastore: %s', f_name, self.datastore)
         logger.debug('%s: data in file: %s', f_name, self.filename)
         return {'datastore': self.datastore, 'filename': self.filename}
 
     @classmethod
-    @recursive_deserialize
     @versioned_deserialize
     def from_dict(cls, m_dict):
         if 'datastore' in m_dict and m_dict['datastore'] is not None:
             if m_dict['datastore']['type'] is None:
                 return cls(m_dict['value'], m_dict['datastore'])
             assert 'filename' in m_dict and m_dict['filename'] is not None
-            val = ioops.lade_data(m_dict['datastore'], m_dict['filename'])
-            dval = recursive_deserialize(lambda _, x: x)(None, {'v': val})['v']
-            return cls(dval, m_dict['datastore'], m_dict['filename'])
-        return cls(m_dict['value'])
+            assert 'value' not in m_dict
+            return cls(None, m_dict['datastore'], m_dict['filename'])
+        return cls(m_dict['value'])  # not covered
+
+    @cached_property
+    def value(self):
+        """deserialize the value"""
+        def restore_value(val):
+            @recursive_deserialize
+            def restore_from_dict(_, dct):
+                return dct
+            return restore_from_dict(None, {'v': val})['v']
+
+        if self.datastore is not None:
+            if self.datastore['type'] is None:
+                return restore_value(self.__value)
+            assert self.filename is not None
+            assert self.__value is None
+            self.__value = ioops.lade_data(self.datastore, self.filename)
+        return restore_value(self.__value)  # fixme: restore only once, after the if block
 
     @classmethod
     def from_obj(cls, obj):
-- 
GitLab


From 607be1200e13d7f315e8aa8a5613bfd26764a3d0 Mon Sep 17 00:00:00 2001
From: Ivan Kondov <ivan.kondov@kit.edu>
Date: Sun, 30 Mar 2025 08:47:02 +0200
Subject: [PATCH 2/4] include a sentence about the type: null option in the
 docs

---
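Notes:

The added sentence documents that data offloading can be switched off
entirely via the datastore configuration. A minimal configuration file using
this option could look as follows (with type set to null, the remaining keys
have no effect):

```yaml
# deactivate data offloading regardless of the other settings
type: null
```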
 docs/io.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/io.md b/docs/io.md
index b3b53275..071ffe4c 100644
--- a/docs/io.md
+++ b/docs/io.md
@@ -70,8 +70,8 @@ In the workflow executor, sometimes a computed parameter value allocates too muc
 If you wish to change these settings you can create a custom datastore configuration file with these contents:
 
 ```yaml
-inline-threshold: 100000            # threshold for offloading data, in bytes
 type: file                          # can be 'gridfs' for database file object storage
+inline-threshold: 100000            # threshold for offloading data, in bytes
 path: /path/to/local/workspace      # directory used if type is 'file'
 name: vre_language_datastore        # collection name used if type is 'gridfs'
 launchpad: /path/to/launchpad.yaml  # path to custom launchpad file used if type is 'gridfs'
@@ -79,7 +79,8 @@ format: json                        # 'yaml' and 'hdf5' not implemented
 compress: true                      # use compression
 ```
 
-The type `file` triggers storage in local files in `path`. The default `path` is `$HOME/.fireworks/vre-language-datastore`. The `path` will be created automatically if it does not exist. The default `launchpad` is `LAUNCHPAD_LOC` as provided by FireWorks. All other default settings are shown in the example above.
+The `type: file` setting triggers storage in local files in `path`. Setting `type: null` deactivates data offloading entirely, regardless of the other settings.
+The default `path` is `$HOME/.fireworks/vre-language-datastore`. The `path` will be created automatically if it does not exist. The default `launchpad` is `LAUNCHPAD_LOC` as provided by FireWorks. All other default settings are shown in the example above.
 
 The default path of the datastore configuration file is `$HOME/.fireworks/datastore_config.yaml`. It will be automatically loaded, if the file exists. If your datastore configuration has a different location then you must set the environment variable
 
-- 
GitLab


From 416e2dc59210dc2eb313fb45daa11bd2e20c97ce Mon Sep 17 00:00:00 2001
From: Ivan Kondov <ivan.kondov@kit.edu>
Date: Sun, 30 Mar 2025 11:25:37 +0200
Subject: [PATCH 3/4] explicitly assert datastore is not None, keep versions 6
 and 7 compatible

---
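Notes:

After this change from_dict() requires the 'datastore' key and accepts
exactly two dict shapes: inline data (datastore type None, 'value' present)
and offloaded data (datastore type set, 'filename' present, no 'value'). A
sketch of the two shapes with made-up field values, mirroring the asserts:

```python
# inline: the serialized value travels in the dict itself
inline = {'datastore': {'type': None}, 'value': {'v': 1}}
# offloaded: only a reference to the external data travels in the dict
offloaded = {'datastore': {'type': 'file'}, 'filename': 'data.json'}

for dct in (inline, offloaded):
    assert dct.get('datastore') is not None
    if dct['datastore']['type'] is None:
        assert 'value' in dct
    else:
        assert 'filename' in dct and 'value' not in dct
```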
 .../language/utilities/serializable.py        | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/virtmat/language/utilities/serializable.py b/src/virtmat/language/utilities/serializable.py
index 2544c1f7..7594c053 100644
--- a/src/virtmat/language/utilities/serializable.py
+++ b/src/virtmat/language/utilities/serializable.py
@@ -102,30 +102,32 @@ class FWDataObject(FWSerializable):
     @classmethod
     @versioned_deserialize
     def from_dict(cls, m_dict):
-        if 'datastore' in m_dict and m_dict['datastore'] is not None:
-            if m_dict['datastore']['type'] is None:
-                return cls(m_dict['value'], m_dict['datastore'])
-            assert 'filename' in m_dict and m_dict['filename'] is not None
-            assert 'value' not in m_dict
-            return cls(None, m_dict['datastore'], m_dict['filename'])
-        return cls(m_dict['value'])  # not covered
+        assert 'datastore' in m_dict and m_dict['datastore'] is not None
+        if m_dict['datastore']['type'] is None:
+            return cls(m_dict['value'], m_dict['datastore'])
+        assert 'filename' in m_dict and m_dict['filename'] is not None
+        assert 'value' not in m_dict
+        return cls(None, m_dict['datastore'], m_dict['filename'])
 
     @cached_property
     def value(self):
-        """deserialize the value"""
+        """restore the value if datastore is defined, otherwise just return"""
+        assert self.datastore is not None
+        # if self.datastore is None:
+        #    return self.__value  # if created with from_obj, no known use case
+
         def restore_value(val):
             @recursive_deserialize
             def restore_from_dict(_, dct):
                 return dct
             return restore_from_dict(None, {'v': val})['v']
 
-        if self.datastore is not None:
-            if self.datastore['type'] is None:
-                return restore_value(self.__value)
-            assert self.filename is not None
-            assert self.__value is None
-            self.__value = ioops.lade_data(self.datastore, self.filename)
-        return restore_value(self.__value)  # fixme: restore only once, after the if block
+        if self.datastore['type'] is None:
+            return restore_value(self.__value)
+        assert self.filename is not None
+        assert self.__value is None
+        self.__value = ioops.lade_data(self.datastore, self.filename)
+        return restore_value(self.__value)
 
     @classmethod
     def from_obj(cls, obj):
-- 
GitLab


From 754ccf9f082991f84aac5835769d0b30ee0d723c Mon Sep 17 00:00:00 2001
From: Ivan Kondov <ivan.kondov@kit.edu>
Date: Sun, 30 Mar 2025 20:01:57 +0200
Subject: [PATCH 4/4] avoid unnecessary serialization calls, refactor
 to_dict()

---
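Notes:

to_dict() now calls recursive_dict() once, caches the result in __value, and
skips the size check altogether when no datastore type is configured. The
size check itself is bounded by the threshold; a sketch of that idea (the
real get_json_size() may be implemented differently, this is only an
assumption based on its signature and the islice/JSONEncoder imports in this
module):

```python
from itertools import islice
from json import JSONEncoder

def json_size_capped(obj, max_size, chunk=1000):
    """JSON-encoded size of obj in bytes, counted in chunks and stopping
    once max_size is exceeded, so huge objects are never encoded in full"""
    size = 0
    parts = JSONEncoder().iterencode(obj)
    while size <= max_size:
        batch = list(islice(parts, chunk))
        if not batch:
            break
        size += sum(len(p) for p in batch)
    return size
```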
 .../language/utilities/serializable.py        | 42 +++++++++----------
 1 file changed, 20 insertions(+), 22 deletions(-)

diff --git a/src/virtmat/language/utilities/serializable.py b/src/virtmat/language/utilities/serializable.py
index 7594c053..28051cfa 100644
--- a/src/virtmat/language/utilities/serializable.py
+++ b/src/virtmat/language/utilities/serializable.py
@@ -68,34 +68,31 @@ class FWDataObject(FWSerializable):
     _fw_name = '{{' + __loader__.name + '.' + __qualname__ + '}}'
 
     @serialize_fw
-    @recursive_serialize
     @versioned_serialize
     def to_dict(self):
         f_name = f'{__name__}.{self.__class__.__name__}.to_dict()'
         logger = get_fw_logger(f_name)
         logger.debug('%s: starting', f_name)
         if self.datastore is None:
-            dct = recursive_dict(self.__value)
-            b_thres = ioops.DATASTORE_CONFIG['inline-threshold']
-            b_size = get_json_size(dct, b_thres)
             logger.debug('%s: data type: %s', f_name, type(self.__value))
-            logger.debug('%s: data size [B]: %s', f_name, b_size)
-            logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres)
-            if b_size < b_thres:
-                self.datastore = {'type': None}
-                logger.info('%s: data not offloaded', f_name)
-                return {'value': dct, 'datastore': self.datastore}
-            logger.info('%s: inline data limit exceeded: %s', f_name, b_size)
-            self.datastore, self.filename = ioops.offload_data(dct)
-            if self.datastore['type'] is None:
-                # not covered: the whole block is skipped if ioops.DATASTORE_CONFIG['type'] is None
-                logger.info('%s: data not offloaded', f_name)
-            else:
-                logger.info('%s: data offloaded in %s', f_name, self.filename)
-        if self.datastore['type'] is None:
-            logger.debug('%s: datastore: %s', f_name, self.datastore)
-            return {'value': self.__value, 'datastore': self.datastore}  # fixme: avoid repeated serialization
+            self.__value = recursive_dict(self.__value)
+            if ioops.DATASTORE_CONFIG['type'] is not None:
+                b_thres = ioops.DATASTORE_CONFIG['inline-threshold']
+                b_size = get_json_size(self.__value, b_thres)
+                logger.debug('%s: data size [B]: %s', f_name, b_size)
+                logger.debug('%s: inline-threshold [B]: %s', f_name, b_thres)
+                if b_size > b_thres:
+                    logger.info('%s: inline data limit exceeded: %s', f_name, b_size)
+                    self.datastore, self.filename = ioops.offload_data(self.__value)
+                    assert self.datastore['type'] is not None
+                    logger.info('%s: data offloaded in %s', f_name, self.filename)
+                    return {'datastore': self.datastore, 'filename': self.filename}
+            self.datastore = {'type': None}
+            logger.info('%s: data not offloaded', f_name)
+            return {'value': self.__value, 'datastore': self.datastore}
         logger.debug('%s: datastore: %s', f_name, self.datastore)
+        if self.datastore['type'] is None:
+            return {'value': self.__value, 'datastore': self.datastore}
         logger.debug('%s: data in file: %s', f_name, self.filename)
         return {'datastore': self.datastore, 'filename': self.filename}
 
@@ -113,8 +110,9 @@ class FWDataObject(FWSerializable):
     def value(self):
         """restore the value if datastore is defined, otherwise just return"""
         assert self.datastore is not None
-        # if self.datastore is None:
-        #    return self.__value  # if created with from_obj, no known use case
+        # a value created with from_obj() must not be restored; no known use case
+        # if self.datastore is None:  # from_obj() sets datastore to None
+        #    return self.__value
 
         def restore_value(val):
             @recursive_deserialize
-- 
GitLab