schematic.models.metadata

  1import logging
  2import os
  3from os.path import exists
  4
  5# allows specifying explicit variable types
  6from typing import Any, Dict, List, Optional, Text
  7
  8import networkx as nx
  9from jsonschema import ValidationError
 10from opentelemetry import trace
 11
 12from schematic.manifest.generator import ManifestGenerator
 13from schematic.models.validate_manifest import validate_all
 14from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 15from schematic.schemas.data_model_json_schema import DataModelJSONSchema
 16from schematic.schemas.data_model_parser import DataModelParser
 17
 18# TODO: This module should only be aware of the store interface
 19# we shouldn't need to expose Synapse functionality explicitly
 20from schematic.store.synapse import SynapseStorage
 21from schematic.utils.df_utils import load_df
 22
 23logger = logging.getLogger(__name__)
 24
 25tracer = trace.get_tracer("Schematic")
 26
 27
 28class MetadataModel(object):
 29    """Metadata model wrapper around schema.org specification graph.
 30
 31    Provides basic utilities to:
 32
 33    1) manipulate the metadata model
 34    2) generate metadata model views:
 35        - generate manifest view of the metadata model
 36        - generate validation schema view of the metadata model
 37    """
 38
 39    def __init__(
 40        self,
 41        inputMModelLocation: str,
 42        inputMModelLocationType: str,
 43        data_model_labels: str,
 44    ) -> None:
 45        """Instantiates a MetadataModel object.
 46
 47        Args:
 48            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
 49            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
 50        """
 51        # extract extension of 'inputMModelLocation'
 52        # ensure that it is necessarily pointing to a '.jsonld' file
 53
 54        logger.debug(
 55            f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema."
 56        )
 57
 58        # self.inputMModelLocation remains for backwards compatibility
 59        self.inputMModelLocation = inputMModelLocation
 60        self.path_to_json_ld = inputMModelLocation
 61
 62        data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation)
 63        # Parse Model
 64        parsed_data_model = data_model_parser.parse_model()
 65
 66        # Instantiate DataModelGraph
 67        data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels)
 68
 69        # Generate graph
 70        self.graph_data_model = data_model_grapher.graph
 71
 72        self.dmge = DataModelGraphExplorer(self.graph_data_model)
 73
 74        # check if the type of MModel file is "local"
 75        # currently, the application only supports reading from local JSON-LD files
 76        if inputMModelLocationType == "local":
 77            self.inputMModelLocationType = inputMModelLocationType
 78        else:
 79            raise ValueError(
 80                f"The type '{inputMModelLocationType}' is currently not supported."
 81            )
 82
 83    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
 84        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.
 85
 86        Args:
 87            rootNode: a schema node label (i.e. term).
 88            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
 89
 90        Returns:
 91            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
 92
 93        Raises:
 94            ValueError: rootNode not found in metadata model.
 95        """
 96        pass
 97
 98    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
 99        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.
100
101        Args:
102            rootNode: a schema object/node label (i.e. term)
103            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
104
105        Returns:
106            An ordered list of objects, that are all descendants of rootNode.
107
108        Raises:
109            ValueError: rootNode not found in metadata model.
110        """
111        ordered_nodes = self.dmge.get_descendants_by_edge_type(
112            rootNode, relationshipType, connected=True, ordered=True
113        )
114
115        ordered_nodes.reverse()
116
117        return ordered_nodes
118
119    def getModelManifest(
120        self,
121        title: str,
122        rootNode: str,
123        datasetId: str = None,
124        jsonSchema: str = None,
125        filenames: list = None,
126        useAnnotations: bool = False,
127        sheetUrl: bool = True,
128    ) -> str:
129        """Gets data from the annotations manifest file.
130
131        TBD: Does this method belong here or in manifest generator?
132
133        Args:
134            rootNode: a schema node label (i.e. term).
135            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
136
137        Returns:
138            A manifest URI (assume Google doc for now).
139
140        Raises:
141            ValueError: rootNode not found in metadata model.
142        """
143        additionalMetadata = {}
144        if filenames:
145            additionalMetadata["Filename"] = filenames
146
147        mg = ManifestGenerator(
148            path_to_json_ld=self.inputMModelLocation,
149            graph=self.graph_data_model,
150            title=title,
151            root=rootNode,
152            additional_metadata=additionalMetadata,
153            use_annotations=useAnnotations,
154        )
155
156        if datasetId:
157            return mg.get_manifest(
158                dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl
159            )
160
161        return mg.get_manifest(sheet_url=sheetUrl)
162
163    def get_component_requirements(
164        self, source_component: str, as_graph: bool = False
165    ) -> List:
166        """Given a source model component (see https://w3id.org/biolink/vocab/category for definnition of component), return all components required by it.
167        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
168        Can be utilized to track metadata completion progress across multiple categories of attributes.
169
170        Args:
171            source_component: an attribute label indicating the source component.
172            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
173
174        Returns:
175            A list of required components associated with the source component.
176        """
177
178        # get required components for the input/source component
179        req_components = self.dmge.get_component_requirements(source_component)
180
181        # retreive components as graph
182        if as_graph:
183            req_components_graph = self.dmge.get_component_requirements_graph(
184                source_component
185            )
186
187            # serialize component dependencies DAG to a edge list of node tuples
188            req_components = list(req_components_graph.edges())
189
190            return req_components
191
192        return req_components
193
194    # TODO: abstract validation in its own module
195    @tracer.start_as_current_span("MetadataModel::validateModelManifest")
196    def validateModelManifest(
197        self,
198        manifestPath: str,
199        rootNode: str,
200        restrict_rules: bool = False,
201        jsonSchema: Optional[str] = None,
202        project_scope: Optional[List] = None,
203        dataset_scope: Optional[str] = None,
204        access_token: Optional[str] = None,
205    ) -> tuple[list, list]:
206        """Check if provided annotations manifest dataframe satisfies all model requirements.
207
208        Args:
209            rootNode: a schema node label (i.e. term).
210            manifestPath: a path to the manifest csv file containing annotations.
211            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
212
213        Returns:
214            A validation status message; if there is an error the message.
215            contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.
216
217        Raises:
218            ValueError: rootNode not found in metadata model.
219        """
220        # get validation schema for a given node in the data model, if the user has not provided input validation schema
221
222        if not jsonSchema:
223            # Instantiate Data Model Json Schema
224            self.data_model_js = DataModelJSONSchema(
225                jsonld_path=self.inputMModelLocation, graph=self.graph_data_model
226            )
227
228            jsonSchema = self.data_model_js.get_json_validation_schema(
229                rootNode, rootNode + "_validation"
230            )
231
232        errors = []
233        warnings = []
234
235        load_args = {
236            "dtype": "string",
237        }
238        # get annotations from manifest (array of json annotations corresponding to manifest rows)
239        manifest = load_df(
240            manifestPath,
241            preserve_raw_input=False,
242            allow_na_values=True,
243            **load_args,
244        )  # read manifest csv file as is from manifest path
245
246        # handler for mismatched components/data types
247        # throw TypeError if the value(s) in the "Component" column differ from the selected template type
248        if ("Component" in manifest.columns) and (
249            (len(manifest["Component"].unique()) > 1)
250            or (manifest["Component"].unique()[0] != rootNode)
251        ):
252            logging.error(
253                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
254                f"selected template type '{rootNode}'."
255            )
256
257            # row indexes for all rows where 'Component' is rootNode
258            row_idxs = manifest.index[manifest["Component"] != rootNode].tolist()
259            # column index value for the 'Component' column
260            col_idx = manifest.columns.get_loc("Component")
261            # Series with index and 'Component' values from manifest
262            mismatched_ser = manifest.iloc[row_idxs, col_idx]
263            for index, component in mismatched_ser.items():
264                errors.append(
265                    [
266                        index + 2,
267                        "Component",
268                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
269                        # tuple of the component in the manifest and selected template type
270                        # check: R/Reticulate cannnot handle dicts? So returning tuple
271                        (component, rootNode),
272                    ]
273                )
274
275            return errors, warnings
276
277        errors, warnings, manifest = validate_all(
278            self,
279            errors=errors,
280            warnings=warnings,
281            manifest=manifest,
282            manifestPath=manifestPath,
283            dmge=self.dmge,
284            jsonSchema=jsonSchema,
285            restrict_rules=restrict_rules,
286            project_scope=project_scope,
287            dataset_scope=dataset_scope,
288            access_token=access_token,
289        )
290        return errors, warnings
291
292    def populateModelManifest(
293        self, title, manifestPath: str, rootNode: str, return_excel=False
294    ) -> str:
295        """Populate an existing annotations manifest based on a dataframe.
296            TODO: Remove this method; always use getModelManifest instead
297
298        Args:
299            rootNode: a schema node label (i.e. term).
300            manifestPath: a path to the manifest csv file containing annotations.
301
302        Returns:
303            A link to the filled in model manifest (e.g. google sheet).
304
305        Raises:
306            ValueError: rootNode not found in metadata model.
307        """
308        mg = ManifestGenerator(
309            path_to_data_model=self.inputMModelLocation,
310            graph=self.graph_data_model,
311            title=title,
312            root=rootNode,
313        )
314
315        emptyManifestURL = mg.get_manifest()
316
317        return mg.populate_manifest_spreadsheet(
318            manifestPath, emptyManifestURL, return_excel=return_excel, title=title
319        )
320
321    @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest")
322    def submit_metadata_manifest(  # pylint: disable=too-many-arguments, too-many-locals
323        self,
324        manifest_path: str,
325        dataset_id: str,
326        manifest_record_type: str,
327        restrict_rules: bool,
328        access_token: Optional[str] = None,
329        validate_component: Optional[str] = None,
330        file_annotations_upload: bool = True,
331        hide_blanks: bool = False,
332        project_scope: Optional[list] = None,
333        dataset_scope: Optional[str] = None,
334        table_manipulation: str = "replace",
335        table_column_names: str = "class_label",
336        annotation_keys: str = "class_label",
337    ) -> str:
338        """
339        Wrap methods that are responsible for validation of manifests for a given component,
340          and association of the same manifest file with a specified dataset.
341
342        Args:
343            manifest_path (str): Path to the manifest file, which contains the metadata.
344            dataset_id (str): Synapse ID of the dataset on Synapse containing the
345              metadata manifest file.
346            manifest_record_type (str): How the manifest is stored in Synapse
347            restrict_rules (bool):
348              If True: bypass great expectations and restrict rule options to
349                those implemented in house
350            access_token (Optional[str], optional): Defaults to None.
351            validate_component (Optional[str], optional): Component from the schema.org
352              schema based on which the manifest template has been generated.
353            file_annotations_upload (bool, optional): Default to True. If false, do
354              not add annotations to files. Defaults to True.
355            hide_blanks (bool, optional): Defaults to False.
356            project_scope (Optional[list], optional): Defaults to None.
357            table_manipulation (str, optional): Defaults to "replace".
358            table_column_names (str, optional): Defaults to "class_label".
359            annotation_keys (str, optional): Defaults to "class_label".
360
361        Raises:
362            ValueError: When validate_component is provided, but it cannot be found in the schema.
363            ValidationError: If validation against data model was not successful.
364
365        Returns:
366            str: If both validation and association were successful.
367        """
368        # TODO: avoid explicitly exposing Synapse store functionality
369        # just instantiate a Store class and let it decide at runtime/config
370        # the store type
371        syn_store = SynapseStorage(
372            access_token=access_token, project_scope=project_scope
373        )
374        manifest_id = None
375        restrict_maniest = False
376        censored_manifest_path = manifest_path.replace(".csv", "_censored.csv")
377        # check if user wants to perform validation or not
378        if validate_component is not None:
379            try:
380                # check if the component ("class" in schema) passed as argument is valid
381                # (present in schema) or not
382                self.dmge.is_class_in_schema(validate_component)
383            except Exception as exc:
384                # a KeyError exception is raised when validate_component fails in the
385                # try-block above here, we are suppressing the KeyError exception and
386                # replacing it with a more descriptive ValueError exception
387                raise ValueError(
388                    f"The component '{validate_component}' could not be found "
389                    f"in the schema here '{self.path_to_json_ld}'"
390                ) from exc
391
392            # automatic JSON schema generation and validation with that JSON schema
393            val_errors, _ = self.validateModelManifest(
394                manifestPath=manifest_path,
395                rootNode=validate_component,
396                restrict_rules=restrict_rules,
397                project_scope=project_scope,
398                dataset_scope=dataset_scope,
399                access_token=access_token,
400            )
401
402            # if there are no errors in validation process
403            if val_errors == []:
404                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
405                if os.path.exists(censored_manifest_path):
406                    syn_store.associateMetadataWithFiles(
407                        dmge=self.dmge,
408                        metadataManifestPath=censored_manifest_path,
409                        datasetId=dataset_id,
410                        manifest_record_type=manifest_record_type,
411                        hideBlanks=hide_blanks,
412                        table_manipulation=table_manipulation,
413                        table_column_names=table_column_names,
414                        annotation_keys=annotation_keys,
415                        file_annotations_upload=file_annotations_upload,
416                    )
417                    restrict_maniest = True
418
419                manifest_id = syn_store.associateMetadataWithFiles(
420                    dmge=self.dmge,
421                    metadataManifestPath=manifest_path,
422                    datasetId=dataset_id,
423                    manifest_record_type=manifest_record_type,
424                    hideBlanks=hide_blanks,
425                    restrict_manifest=restrict_maniest,
426                    table_manipulation=table_manipulation,
427                    table_column_names=table_column_names,
428                    annotation_keys=annotation_keys,
429                    file_annotations_upload=file_annotations_upload,
430                )
431
432                logger.info("No validation errors occured during validation.")
433                return manifest_id
434
435            else:
436                raise ValidationError(
437                    "Manifest could not be validated under provided data model. "
438                    f"Validation failed with the following errors: {val_errors}"
439                )
440
441        # no need to perform validation, just submit/associate the metadata manifest file
442        if os.path.exists(censored_manifest_path):
443            syn_store.associateMetadataWithFiles(
444                dmge=self.dmge,
445                metadataManifestPath=censored_manifest_path,
446                datasetId=dataset_id,
447                manifest_record_type=manifest_record_type,
448                hideBlanks=hide_blanks,
449                table_manipulation=table_manipulation,
450                table_column_names=table_column_names,
451                annotation_keys=annotation_keys,
452                file_annotations_upload=file_annotations_upload,
453            )
454            restrict_maniest = True
455
456        manifest_id = syn_store.associateMetadataWithFiles(
457            dmge=self.dmge,
458            metadataManifestPath=manifest_path,
459            datasetId=dataset_id,
460            manifest_record_type=manifest_record_type,
461            hideBlanks=hide_blanks,
462            restrict_manifest=restrict_maniest,
463            table_manipulation=table_manipulation,
464            table_column_names=table_column_names,
465            annotation_keys=annotation_keys,
466            file_annotations_upload=file_annotations_upload,
467        )
468
469        logger.debug(
470            "Optional validation was not performed on manifest before association."
471        )
472
473        return manifest_id
logger = <Logger schematic.models.metadata (WARNING)>
tracer = <opentelemetry.trace.ProxyTracer object>
class MetadataModel:
 29class MetadataModel(object):
 30    """Metadata model wrapper around schema.org specification graph.
 31
 32    Provides basic utilities to:
 33
 34    1) manipulate the metadata model
 35    2) generate metadata model views:
 36        - generate manifest view of the metadata model
 37        - generate validation schema view of the metadata model
 38    """
 39
 40    def __init__(
 41        self,
 42        inputMModelLocation: str,
 43        inputMModelLocationType: str,
 44        data_model_labels: str,
 45    ) -> None:
 46        """Instantiates a MetadataModel object.
 47
 48        Args:
 49            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
 50            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
 51        """
 52        # extract extension of 'inputMModelLocation'
 53        # ensure that it is necessarily pointing to a '.jsonld' file
 54
 55        logger.debug(
 56            f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema."
 57        )
 58
 59        # self.inputMModelLocation remains for backwards compatibility
 60        self.inputMModelLocation = inputMModelLocation
 61        self.path_to_json_ld = inputMModelLocation
 62
 63        data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation)
 64        # Parse Model
 65        parsed_data_model = data_model_parser.parse_model()
 66
 67        # Instantiate DataModelGraph
 68        data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels)
 69
 70        # Generate graph
 71        self.graph_data_model = data_model_grapher.graph
 72
 73        self.dmge = DataModelGraphExplorer(self.graph_data_model)
 74
 75        # check if the type of MModel file is "local"
 76        # currently, the application only supports reading from local JSON-LD files
 77        if inputMModelLocationType == "local":
 78            self.inputMModelLocationType = inputMModelLocationType
 79        else:
 80            raise ValueError(
 81                f"The type '{inputMModelLocationType}' is currently not supported."
 82            )
 83
 84    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
 85        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.
 86
 87        Args:
 88            rootNode: a schema node label (i.e. term).
 89            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
 90
 91        Returns:
 92            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
 93
 94        Raises:
 95            ValueError: rootNode not found in metadata model.
 96        """
 97        pass
 98
 99    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
100        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.
101
102        Args:
103            rootNode: a schema object/node label (i.e. term)
104            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
105
106        Returns:
107            An ordered list of objects, that are all descendants of rootNode.
108
109        Raises:
110            ValueError: rootNode not found in metadata model.
111        """
112        ordered_nodes = self.dmge.get_descendants_by_edge_type(
113            rootNode, relationshipType, connected=True, ordered=True
114        )
115
116        ordered_nodes.reverse()
117
118        return ordered_nodes
119
120    def getModelManifest(
121        self,
122        title: str,
123        rootNode: str,
124        datasetId: str = None,
125        jsonSchema: str = None,
126        filenames: list = None,
127        useAnnotations: bool = False,
128        sheetUrl: bool = True,
129    ) -> str:
130        """Gets data from the annotations manifest file.
131
132        TBD: Does this method belong here or in manifest generator?
133
134        Args:
135            rootNode: a schema node label (i.e. term).
136            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
137
138        Returns:
139            A manifest URI (assume Google doc for now).
140
141        Raises:
142            ValueError: rootNode not found in metadata model.
143        """
144        additionalMetadata = {}
145        if filenames:
146            additionalMetadata["Filename"] = filenames
147
148        mg = ManifestGenerator(
149            path_to_json_ld=self.inputMModelLocation,
150            graph=self.graph_data_model,
151            title=title,
152            root=rootNode,
153            additional_metadata=additionalMetadata,
154            use_annotations=useAnnotations,
155        )
156
157        if datasetId:
158            return mg.get_manifest(
159                dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl
160            )
161
162        return mg.get_manifest(sheet_url=sheetUrl)
163
164    def get_component_requirements(
165        self, source_component: str, as_graph: bool = False
166    ) -> List:
167        """Given a source model component (see https://w3id.org/biolink/vocab/category for definnition of component), return all components required by it.
168        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
169        Can be utilized to track metadata completion progress across multiple categories of attributes.
170
171        Args:
172            source_component: an attribute label indicating the source component.
173            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
174
175        Returns:
176            A list of required components associated with the source component.
177        """
178
179        # get required components for the input/source component
180        req_components = self.dmge.get_component_requirements(source_component)
181
182        # retreive components as graph
183        if as_graph:
184            req_components_graph = self.dmge.get_component_requirements_graph(
185                source_component
186            )
187
188            # serialize component dependencies DAG to a edge list of node tuples
189            req_components = list(req_components_graph.edges())
190
191            return req_components
192
193        return req_components
194
195    # TODO: abstract validation in its own module
196    @tracer.start_as_current_span("MetadataModel::validateModelManifest")
197    def validateModelManifest(
198        self,
199        manifestPath: str,
200        rootNode: str,
201        restrict_rules: bool = False,
202        jsonSchema: Optional[str] = None,
203        project_scope: Optional[List] = None,
204        dataset_scope: Optional[str] = None,
205        access_token: Optional[str] = None,
206    ) -> tuple[list, list]:
207        """Check if provided annotations manifest dataframe satisfies all model requirements.
208
209        Args:
210            rootNode: a schema node label (i.e. term).
211            manifestPath: a path to the manifest csv file containing annotations.
212            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
213
214        Returns:
215            A validation status message; if there is an error the message.
216            contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.
217
218        Raises:
219            ValueError: rootNode not found in metadata model.
220        """
221        # get validation schema for a given node in the data model, if the user has not provided input validation schema
222
223        if not jsonSchema:
224            # Instantiate Data Model Json Schema
225            self.data_model_js = DataModelJSONSchema(
226                jsonld_path=self.inputMModelLocation, graph=self.graph_data_model
227            )
228
229            jsonSchema = self.data_model_js.get_json_validation_schema(
230                rootNode, rootNode + "_validation"
231            )
232
233        errors = []
234        warnings = []
235
236        load_args = {
237            "dtype": "string",
238        }
239        # get annotations from manifest (array of json annotations corresponding to manifest rows)
240        manifest = load_df(
241            manifestPath,
242            preserve_raw_input=False,
243            allow_na_values=True,
244            **load_args,
245        )  # read manifest csv file as is from manifest path
246
247        # handler for mismatched components/data types
248        # throw TypeError if the value(s) in the "Component" column differ from the selected template type
249        if ("Component" in manifest.columns) and (
250            (len(manifest["Component"].unique()) > 1)
251            or (manifest["Component"].unique()[0] != rootNode)
252        ):
253            logging.error(
254                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
255                f"selected template type '{rootNode}'."
256            )
257
258            # row indexes for all rows where 'Component' is rootNode
259            row_idxs = manifest.index[manifest["Component"] != rootNode].tolist()
260            # column index value for the 'Component' column
261            col_idx = manifest.columns.get_loc("Component")
262            # Series with index and 'Component' values from manifest
263            mismatched_ser = manifest.iloc[row_idxs, col_idx]
264            for index, component in mismatched_ser.items():
265                errors.append(
266                    [
267                        index + 2,
268                        "Component",
269                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
270                        # tuple of the component in the manifest and selected template type
271                        # check: R/Reticulate cannnot handle dicts? So returning tuple
272                        (component, rootNode),
273                    ]
274                )
275
276            return errors, warnings
277
278        errors, warnings, manifest = validate_all(
279            self,
280            errors=errors,
281            warnings=warnings,
282            manifest=manifest,
283            manifestPath=manifestPath,
284            dmge=self.dmge,
285            jsonSchema=jsonSchema,
286            restrict_rules=restrict_rules,
287            project_scope=project_scope,
288            dataset_scope=dataset_scope,
289            access_token=access_token,
290        )
291        return errors, warnings
292
293    def populateModelManifest(
294        self, title, manifestPath: str, rootNode: str, return_excel=False
295    ) -> str:
296        """Populate an existing annotations manifest based on a dataframe.
297            TODO: Remove this method; always use getModelManifest instead
298
299        Args:
300            rootNode: a schema node label (i.e. term).
301            manifestPath: a path to the manifest csv file containing annotations.
302
303        Returns:
304            A link to the filled in model manifest (e.g. google sheet).
305
306        Raises:
307            ValueError: rootNode not found in metadata model.
308        """
309        mg = ManifestGenerator(
310            path_to_data_model=self.inputMModelLocation,
311            graph=self.graph_data_model,
312            title=title,
313            root=rootNode,
314        )
315
316        emptyManifestURL = mg.get_manifest()
317
318        return mg.populate_manifest_spreadsheet(
319            manifestPath, emptyManifestURL, return_excel=return_excel, title=title
320        )
321
322    @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest")
323    def submit_metadata_manifest(  # pylint: disable=too-many-arguments, too-many-locals
324        self,
325        manifest_path: str,
326        dataset_id: str,
327        manifest_record_type: str,
328        restrict_rules: bool,
329        access_token: Optional[str] = None,
330        validate_component: Optional[str] = None,
331        file_annotations_upload: bool = True,
332        hide_blanks: bool = False,
333        project_scope: Optional[list] = None,
334        dataset_scope: Optional[str] = None,
335        table_manipulation: str = "replace",
336        table_column_names: str = "class_label",
337        annotation_keys: str = "class_label",
338    ) -> str:
339        """
340        Wrap methods that are responsible for validation of manifests for a given component,
341          and association of the same manifest file with a specified dataset.
342
343        Args:
344            manifest_path (str): Path to the manifest file, which contains the metadata.
345            dataset_id (str): Synapse ID of the dataset on Synapse containing the
346              metadata manifest file.
347            manifest_record_type (str): How the manifest is stored in Synapse
348            restrict_rules (bool):
349              If True: bypass great expectations and restrict rule options to
350                those implemented in house
351            access_token (Optional[str], optional): Defaults to None.
352            validate_component (Optional[str], optional): Component from the schema.org
353              schema based on which the manifest template has been generated.
354            file_annotations_upload (bool, optional): Default to True. If false, do
355              not add annotations to files. Defaults to True.
356            hide_blanks (bool, optional): Defaults to False.
357            project_scope (Optional[list], optional): Defaults to None.
358            table_manipulation (str, optional): Defaults to "replace".
359            table_column_names (str, optional): Defaults to "class_label".
360            annotation_keys (str, optional): Defaults to "class_label".
361
362        Raises:
363            ValueError: When validate_component is provided, but it cannot be found in the schema.
364            ValidationError: If validation against data model was not successful.
365
366        Returns:
367            str: If both validation and association were successful.
368        """
369        # TODO: avoid explicitly exposing Synapse store functionality
370        # just instantiate a Store class and let it decide at runtime/config
371        # the store type
372        syn_store = SynapseStorage(
373            access_token=access_token, project_scope=project_scope
374        )
375        manifest_id = None
376        restrict_maniest = False
377        censored_manifest_path = manifest_path.replace(".csv", "_censored.csv")
378        # check if user wants to perform validation or not
379        if validate_component is not None:
380            try:
381                # check if the component ("class" in schema) passed as argument is valid
382                # (present in schema) or not
383                self.dmge.is_class_in_schema(validate_component)
384            except Exception as exc:
385                # a KeyError exception is raised when validate_component fails in the
386                # try-block above here, we are suppressing the KeyError exception and
387                # replacing it with a more descriptive ValueError exception
388                raise ValueError(
389                    f"The component '{validate_component}' could not be found "
390                    f"in the schema here '{self.path_to_json_ld}'"
391                ) from exc
392
393            # automatic JSON schema generation and validation with that JSON schema
394            val_errors, _ = self.validateModelManifest(
395                manifestPath=manifest_path,
396                rootNode=validate_component,
397                restrict_rules=restrict_rules,
398                project_scope=project_scope,
399                dataset_scope=dataset_scope,
400                access_token=access_token,
401            )
402
403            # if there are no errors in validation process
404            if val_errors == []:
405                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
406                if os.path.exists(censored_manifest_path):
407                    syn_store.associateMetadataWithFiles(
408                        dmge=self.dmge,
409                        metadataManifestPath=censored_manifest_path,
410                        datasetId=dataset_id,
411                        manifest_record_type=manifest_record_type,
412                        hideBlanks=hide_blanks,
413                        table_manipulation=table_manipulation,
414                        table_column_names=table_column_names,
415                        annotation_keys=annotation_keys,
416                        file_annotations_upload=file_annotations_upload,
417                    )
418                    restrict_maniest = True
419
420                manifest_id = syn_store.associateMetadataWithFiles(
421                    dmge=self.dmge,
422                    metadataManifestPath=manifest_path,
423                    datasetId=dataset_id,
424                    manifest_record_type=manifest_record_type,
425                    hideBlanks=hide_blanks,
426                    restrict_manifest=restrict_maniest,
427                    table_manipulation=table_manipulation,
428                    table_column_names=table_column_names,
429                    annotation_keys=annotation_keys,
430                    file_annotations_upload=file_annotations_upload,
431                )
432
433                logger.info("No validation errors occured during validation.")
434                return manifest_id
435
436            else:
437                raise ValidationError(
438                    "Manifest could not be validated under provided data model. "
439                    f"Validation failed with the following errors: {val_errors}"
440                )
441
442        # no need to perform validation, just submit/associate the metadata manifest file
443        if os.path.exists(censored_manifest_path):
444            syn_store.associateMetadataWithFiles(
445                dmge=self.dmge,
446                metadataManifestPath=censored_manifest_path,
447                datasetId=dataset_id,
448                manifest_record_type=manifest_record_type,
449                hideBlanks=hide_blanks,
450                table_manipulation=table_manipulation,
451                table_column_names=table_column_names,
452                annotation_keys=annotation_keys,
453                file_annotations_upload=file_annotations_upload,
454            )
455            restrict_maniest = True
456
457        manifest_id = syn_store.associateMetadataWithFiles(
458            dmge=self.dmge,
459            metadataManifestPath=manifest_path,
460            datasetId=dataset_id,
461            manifest_record_type=manifest_record_type,
462            hideBlanks=hide_blanks,
463            restrict_manifest=restrict_maniest,
464            table_manipulation=table_manipulation,
465            table_column_names=table_column_names,
466            annotation_keys=annotation_keys,
467            file_annotations_upload=file_annotations_upload,
468        )
469
470        logger.debug(
471            "Optional validation was not performed on manifest before association."
472        )
473
474        return manifest_id

Metadata model wrapper around schema.org specification graph.

Provides basic utilities to:

1) manipulate the metadata model
2) generate metadata model views:
   - generate manifest view of the metadata model
   - generate validation schema view of the metadata model

MetadataModel( inputMModelLocation: str, inputMModelLocationType: str, data_model_labels: str)
40    def __init__(
41        self,
42        inputMModelLocation: str,
43        inputMModelLocationType: str,
44        data_model_labels: str,
45    ) -> None:
46        """Instantiates a MetadataModel object.
47
48        Args:
49            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
50            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
51        """
52        # extract extension of 'inputMModelLocation'
53        # ensure that it is necessarily pointing to a '.jsonld' file
54
55        logger.debug(
56            f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema."
57        )
58
59        # self.inputMModelLocation remains for backwards compatibility
60        self.inputMModelLocation = inputMModelLocation
61        self.path_to_json_ld = inputMModelLocation
62
63        data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation)
64        # Parse Model
65        parsed_data_model = data_model_parser.parse_model()
66
67        # Instantiate DataModelGraph
68        data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels)
69
70        # Generate graph
71        self.graph_data_model = data_model_grapher.graph
72
73        self.dmge = DataModelGraphExplorer(self.graph_data_model)
74
75        # check if the type of MModel file is "local"
76        # currently, the application only supports reading from local JSON-LD files
77        if inputMModelLocationType == "local":
78            self.inputMModelLocationType = inputMModelLocationType
79        else:
80            raise ValueError(
81                f"The type '{inputMModelLocationType}' is currently not supported."
82            )

Instantiates a MetadataModel object.

Arguments:
  • inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
  • inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
inputMModelLocation
path_to_json_ld
graph_data_model
dmge
def getModelSubgraph( self, rootNode: str, subgraphType: str) -> networkx.classes.digraph.DiGraph:
84    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
85        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.
86
87        Args:
88            rootNode: a schema node label (i.e. term).
89            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
90
91        Returns:
92            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
93
94        Raises:
95            ValueError: rootNode not found in metadata model.
96        """
97        pass

Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.

Arguments:
  • rootNode: a schema node label (i.e. term).
  • subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
Returns:

A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.

Raises:
  • ValueError: rootNode not found in metadata model.
def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
 99    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
100        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.
101
102        Args:
103            rootNode: a schema object/node label (i.e. term)
104            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
105
106        Returns:
107            An ordered list of objects, that are all descendants of rootNode.
108
109        Raises:
110            ValueError: rootNode not found in metadata model.
111        """
112        ordered_nodes = self.dmge.get_descendants_by_edge_type(
113            rootNode, relationshipType, connected=True, ordered=True
114        )
115
116        ordered_nodes.reverse()
117
118        return ordered_nodes

Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.

Arguments:
  • rootNode: a schema object/node label (i.e. term)
  • relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
Returns:

An ordered list of objects, that are all descendants of rootNode.

Raises:
  • ValueError: rootNode not found in metadata model.
def getModelManifest( self, title: str, rootNode: str, datasetId: str = None, jsonSchema: str = None, filenames: list = None, useAnnotations: bool = False, sheetUrl: bool = True) -> str:
120    def getModelManifest(
121        self,
122        title: str,
123        rootNode: str,
124        datasetId: str = None,
125        jsonSchema: str = None,
126        filenames: list = None,
127        useAnnotations: bool = False,
128        sheetUrl: bool = True,
129    ) -> str:
130        """Gets data from the annotations manifest file.
131
132        TBD: Does this method belong here or in manifest generator?
133
134        Args:
135            rootNode: a schema node label (i.e. term).
136            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
137
138        Returns:
139            A manifest URI (assume Google doc for now).
140
141        Raises:
142            ValueError: rootNode not found in metadata model.
143        """
144        additionalMetadata = {}
145        if filenames:
146            additionalMetadata["Filename"] = filenames
147
148        mg = ManifestGenerator(
149            path_to_json_ld=self.inputMModelLocation,
150            graph=self.graph_data_model,
151            title=title,
152            root=rootNode,
153            additional_metadata=additionalMetadata,
154            use_annotations=useAnnotations,
155        )
156
157        if datasetId:
158            return mg.get_manifest(
159                dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl
160            )
161
162        return mg.get_manifest(sheet_url=sheetUrl)

Gets data from the annotations manifest file.

TBD: Does this method belong here or in manifest generator?

Arguments:
  • rootNode: a schema node label (i.e. term).
  • useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
Returns:

A manifest URI (assume Google doc for now).

Raises:
  • ValueError: rootNode not found in metadata model.
def get_component_requirements(self, source_component: str, as_graph: bool = False) -> List:
164    def get_component_requirements(
165        self, source_component: str, as_graph: bool = False
166    ) -> List:
167        """Given a source model component (see https://w3id.org/biolink/vocab/category for definnition of component), return all components required by it.
168        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
169        Can be utilized to track metadata completion progress across multiple categories of attributes.
170
171        Args:
172            source_component: an attribute label indicating the source component.
173            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
174
175        Returns:
176            A list of required components associated with the source component.
177        """
178
179        # get required components for the input/source component
180        req_components = self.dmge.get_component_requirements(source_component)
181
182        # retreive components as graph
183        if as_graph:
184            req_components_graph = self.dmge.get_component_requirements_graph(
185                source_component
186            )
187
188            # serialize component dependencies DAG to a edge list of node tuples
189            req_components = list(req_components_graph.edges())
190
191            return req_components
192
193        return req_components

Given a source model component (see https://w3id.org/biolink/vocab/category for definition of component), return all components required by it. Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes; can be utilized to track metadata completion progress across multiple categories of attributes.

Arguments:
  • source_component: an attribute label indicating the source component.
  • as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
Returns:

A list of required components associated with the source component.

@tracer.start_as_current_span('MetadataModel::validateModelManifest')
def validateModelManifest( self, manifestPath: str, rootNode: str, restrict_rules: bool = False, jsonSchema: Optional[str] = None, project_scope: Optional[List] = None, dataset_scope: Optional[str] = None, access_token: Optional[str] = None) -> tuple[list, list]:
196    @tracer.start_as_current_span("MetadataModel::validateModelManifest")
197    def validateModelManifest(
198        self,
199        manifestPath: str,
200        rootNode: str,
201        restrict_rules: bool = False,
202        jsonSchema: Optional[str] = None,
203        project_scope: Optional[List] = None,
204        dataset_scope: Optional[str] = None,
205        access_token: Optional[str] = None,
206    ) -> tuple[list, list]:
207        """Check if provided annotations manifest dataframe satisfies all model requirements.
208
209        Args:
210            rootNode: a schema node label (i.e. term).
211            manifestPath: a path to the manifest csv file containing annotations.
212            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
213
214        Returns:
215            A validation status message; if there is an error the message.
216            contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.
217
218        Raises:
219            ValueError: rootNode not found in metadata model.
220        """
221        # get validation schema for a given node in the data model, if the user has not provided input validation schema
222
223        if not jsonSchema:
224            # Instantiate Data Model Json Schema
225            self.data_model_js = DataModelJSONSchema(
226                jsonld_path=self.inputMModelLocation, graph=self.graph_data_model
227            )
228
229            jsonSchema = self.data_model_js.get_json_validation_schema(
230                rootNode, rootNode + "_validation"
231            )
232
233        errors = []
234        warnings = []
235
236        load_args = {
237            "dtype": "string",
238        }
239        # get annotations from manifest (array of json annotations corresponding to manifest rows)
240        manifest = load_df(
241            manifestPath,
242            preserve_raw_input=False,
243            allow_na_values=True,
244            **load_args,
245        )  # read manifest csv file as is from manifest path
246
247        # handler for mismatched components/data types
248        # throw TypeError if the value(s) in the "Component" column differ from the selected template type
249        if ("Component" in manifest.columns) and (
250            (len(manifest["Component"].unique()) > 1)
251            or (manifest["Component"].unique()[0] != rootNode)
252        ):
253            logging.error(
254                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
255                f"selected template type '{rootNode}'."
256            )
257
258            # row indexes for all rows where 'Component' is rootNode
259            row_idxs = manifest.index[manifest["Component"] != rootNode].tolist()
260            # column index value for the 'Component' column
261            col_idx = manifest.columns.get_loc("Component")
262            # Series with index and 'Component' values from manifest
263            mismatched_ser = manifest.iloc[row_idxs, col_idx]
264            for index, component in mismatched_ser.items():
265                errors.append(
266                    [
267                        index + 2,
268                        "Component",
269                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
270                        # tuple of the component in the manifest and selected template type
271                        # check: R/Reticulate cannnot handle dicts? So returning tuple
272                        (component, rootNode),
273                    ]
274                )
275
276            return errors, warnings
277
278        errors, warnings, manifest = validate_all(
279            self,
280            errors=errors,
281            warnings=warnings,
282            manifest=manifest,
283            manifestPath=manifestPath,
284            dmge=self.dmge,
285            jsonSchema=jsonSchema,
286            restrict_rules=restrict_rules,
287            project_scope=project_scope,
288            dataset_scope=dataset_scope,
289            access_token=access_token,
290        )
291        return errors, warnings

Check if provided annotations manifest dataframe satisfies all model requirements.

Arguments:
  • rootNode: a schema node label (i.e. term).
  • manifestPath: a path to the manifest csv file containing annotations.
  • restrict_rules: bypass great expectations and restrict rule options to those implemented in house
Returns:

A validation status message; if there is an error, the message contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.

Raises:
  • ValueError: rootNode not found in metadata model.
def populateModelManifest(self, title, manifestPath: str, rootNode: str, return_excel=False) -> str:
293    def populateModelManifest(
294        self, title, manifestPath: str, rootNode: str, return_excel=False
295    ) -> str:
296        """Populate an existing annotations manifest based on a dataframe.
297            TODO: Remove this method; always use getModelManifest instead
298
299        Args:
300            rootNode: a schema node label (i.e. term).
301            manifestPath: a path to the manifest csv file containing annotations.
302
303        Returns:
304            A link to the filled in model manifest (e.g. google sheet).
305
306        Raises:
307            ValueError: rootNode not found in metadata model.
308        """
309        mg = ManifestGenerator(
310            path_to_data_model=self.inputMModelLocation,
311            graph=self.graph_data_model,
312            title=title,
313            root=rootNode,
314        )
315
316        emptyManifestURL = mg.get_manifest()
317
318        return mg.populate_manifest_spreadsheet(
319            manifestPath, emptyManifestURL, return_excel=return_excel, title=title
320        )

Populate an existing annotations manifest based on a dataframe. TODO: Remove this method; always use getModelManifest instead

Arguments:
  • rootNode: a schema node label (i.e. term).
  • manifestPath: a path to the manifest csv file containing annotations.
Returns:

A link to the filled in model manifest (e.g. google sheet).

Raises:
  • ValueError: rootNode not found in metadata model.
@tracer.start_as_current_span('MetadataModel::submit_metadata_manifest')
def submit_metadata_manifest( self, manifest_path: str, dataset_id: str, manifest_record_type: str, restrict_rules: bool, access_token: Optional[str] = None, validate_component: Optional[str] = None, file_annotations_upload: bool = True, hide_blanks: bool = False, project_scope: Optional[list] = None, dataset_scope: Optional[str] = None, table_manipulation: str = 'replace', table_column_names: str = 'class_label', annotation_keys: str = 'class_label') -> str:
322    @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest")
323    def submit_metadata_manifest(  # pylint: disable=too-many-arguments, too-many-locals
324        self,
325        manifest_path: str,
326        dataset_id: str,
327        manifest_record_type: str,
328        restrict_rules: bool,
329        access_token: Optional[str] = None,
330        validate_component: Optional[str] = None,
331        file_annotations_upload: bool = True,
332        hide_blanks: bool = False,
333        project_scope: Optional[list] = None,
334        dataset_scope: Optional[str] = None,
335        table_manipulation: str = "replace",
336        table_column_names: str = "class_label",
337        annotation_keys: str = "class_label",
338    ) -> str:
339        """
340        Wrap methods that are responsible for validation of manifests for a given component,
341          and association of the same manifest file with a specified dataset.
342
343        Args:
344            manifest_path (str): Path to the manifest file, which contains the metadata.
345            dataset_id (str): Synapse ID of the dataset on Synapse containing the
346              metadata manifest file.
347            manifest_record_type (str): How the manifest is stored in Synapse
348            restrict_rules (bool):
349              If True: bypass great expectations and restrict rule options to
350                those implemented in house
351            access_token (Optional[str], optional): Defaults to None.
352            validate_component (Optional[str], optional): Component from the schema.org
353              schema based on which the manifest template has been generated.
354            file_annotations_upload (bool, optional): Default to True. If false, do
355              not add annotations to files. Defaults to True.
356            hide_blanks (bool, optional): Defaults to False.
357            project_scope (Optional[list], optional): Defaults to None.
358            table_manipulation (str, optional): Defaults to "replace".
359            table_column_names (str, optional): Defaults to "class_label".
360            annotation_keys (str, optional): Defaults to "class_label".
361
362        Raises:
363            ValueError: When validate_component is provided, but it cannot be found in the schema.
364            ValidationError: If validation against data model was not successful.
365
366        Returns:
367            str: If both validation and association were successful.
368        """
369        # TODO: avoid explicitly exposing Synapse store functionality
370        # just instantiate a Store class and let it decide at runtime/config
371        # the store type
372        syn_store = SynapseStorage(
373            access_token=access_token, project_scope=project_scope
374        )
375        manifest_id = None
376        restrict_maniest = False
377        censored_manifest_path = manifest_path.replace(".csv", "_censored.csv")
378        # check if user wants to perform validation or not
379        if validate_component is not None:
380            try:
381                # check if the component ("class" in schema) passed as argument is valid
382                # (present in schema) or not
383                self.dmge.is_class_in_schema(validate_component)
384            except Exception as exc:
385                # a KeyError exception is raised when validate_component fails in the
386                # try-block above here, we are suppressing the KeyError exception and
387                # replacing it with a more descriptive ValueError exception
388                raise ValueError(
389                    f"The component '{validate_component}' could not be found "
390                    f"in the schema here '{self.path_to_json_ld}'"
391                ) from exc
392
393            # automatic JSON schema generation and validation with that JSON schema
394            val_errors, _ = self.validateModelManifest(
395                manifestPath=manifest_path,
396                rootNode=validate_component,
397                restrict_rules=restrict_rules,
398                project_scope=project_scope,
399                dataset_scope=dataset_scope,
400                access_token=access_token,
401            )
402
403            # if there are no errors in validation process
404            if val_errors == []:
405                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
406                if os.path.exists(censored_manifest_path):
407                    syn_store.associateMetadataWithFiles(
408                        dmge=self.dmge,
409                        metadataManifestPath=censored_manifest_path,
410                        datasetId=dataset_id,
411                        manifest_record_type=manifest_record_type,
412                        hideBlanks=hide_blanks,
413                        table_manipulation=table_manipulation,
414                        table_column_names=table_column_names,
415                        annotation_keys=annotation_keys,
416                        file_annotations_upload=file_annotations_upload,
417                    )
418                    restrict_maniest = True
419
420                manifest_id = syn_store.associateMetadataWithFiles(
421                    dmge=self.dmge,
422                    metadataManifestPath=manifest_path,
423                    datasetId=dataset_id,
424                    manifest_record_type=manifest_record_type,
425                    hideBlanks=hide_blanks,
426                    restrict_manifest=restrict_maniest,
427                    table_manipulation=table_manipulation,
428                    table_column_names=table_column_names,
429                    annotation_keys=annotation_keys,
430                    file_annotations_upload=file_annotations_upload,
431                )
432
433                logger.info("No validation errors occured during validation.")
434                return manifest_id
435
436            else:
437                raise ValidationError(
438                    "Manifest could not be validated under provided data model. "
439                    f"Validation failed with the following errors: {val_errors}"
440                )
441
442        # no need to perform validation, just submit/associate the metadata manifest file
443        if os.path.exists(censored_manifest_path):
444            syn_store.associateMetadataWithFiles(
445                dmge=self.dmge,
446                metadataManifestPath=censored_manifest_path,
447                datasetId=dataset_id,
448                manifest_record_type=manifest_record_type,
449                hideBlanks=hide_blanks,
450                table_manipulation=table_manipulation,
451                table_column_names=table_column_names,
452                annotation_keys=annotation_keys,
453                file_annotations_upload=file_annotations_upload,
454            )
455            restrict_maniest = True
456
457        manifest_id = syn_store.associateMetadataWithFiles(
458            dmge=self.dmge,
459            metadataManifestPath=manifest_path,
460            datasetId=dataset_id,
461            manifest_record_type=manifest_record_type,
462            hideBlanks=hide_blanks,
463            restrict_manifest=restrict_maniest,
464            table_manipulation=table_manipulation,
465            table_column_names=table_column_names,
466            annotation_keys=annotation_keys,
467            file_annotations_upload=file_annotations_upload,
468        )
469
470        logger.debug(
471            "Optional validation was not performed on manifest before association."
472        )
473
474        return manifest_id

Wrap methods that are responsible for validation of manifests for a given component, and association of the same manifest file with a specified dataset.

Arguments:
  • manifest_path (str): Path to the manifest file, which contains the metadata.
  • dataset_id (str): Synapse ID of the dataset on Synapse containing the metadata manifest file.
  • manifest_record_type (str): How the manifest is stored in Synapse
  • restrict_rules (bool): If True: bypass great expectations and restrict rule options to those implemented in house
  • access_token (Optional[str], optional): Defaults to None.
  • validate_component (Optional[str], optional): Component from the schema.org schema based on which the manifest template has been generated.
  • file_annotations_upload (bool, optional): If False, do not add annotations to files. Defaults to True.
  • hide_blanks (bool, optional): Defaults to False.
  • project_scope (Optional[list], optional): Defaults to None.
  • table_manipulation (str, optional): Defaults to "replace".
  • table_column_names (str, optional): Defaults to "class_label".
  • annotation_keys (str, optional): Defaults to "class_label".
Raises:
  • ValueError: When validate_component is provided, but it cannot be found in the schema.
  • ValidationError: If validation against data model was not successful.
Returns:

str: The manifest ID returned by the association step, provided that validation (when requested) and association were both successful.