
  1import logging
  2import os
  3from os.path import exists
  5# allows specifying explicit variable types
  6from typing import Any, Dict, List, Optional, Text
  8import networkx as nx
  9from jsonschema import ValidationError
 10from opentelemetry import trace
 12from schematic.manifest.generator import ManifestGenerator
 13from schematic.models.validate_manifest import validate_all
 14from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
 15from schematic.schemas.data_model_json_schema import DataModelJSONSchema
 16from schematic.schemas.data_model_parser import DataModelParser
 18# TODO: This module should only be aware of the store interface
 19# we shouldn't need to expose Synapse functionality explicitly
 20from import SynapseStorage
 21from schematic.utils.df_utils import load_df
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# OpenTelemetry tracer (instrumentation name "Schematic"); used below to wrap
# selected MetadataModel methods in spans via start_as_current_span.
tracer = trace.get_tracer("Schematic")
 28class MetadataModel(object):
 29    """Metadata model wrapper around specification graph.
 31    Provides basic utilities to:
 33    1) manipulate the metadata model
 34    2) generate metadata model views:
 35        - generate manifest view of the metadata model
 36        - generate validation schema view of the metadata model
 37    """
 39    def __init__(
 40        self,
 41        inputMModelLocation: str,
 42        inputMModelLocationType: str,
 43        data_model_labels: str,
 44    ) -> None:
 45        """Instantiates a MetadataModel object.
 47        Args:
 48            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
 49            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
 50        """
 51        # extract extension of 'inputMModelLocation'
 52        # ensure that it is necessarily pointing to a '.jsonld' file
 54        logger.debug(
 55            f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema."
 56        )
 58        # self.inputMModelLocation remains for backwards compatibility
 59        self.inputMModelLocation = inputMModelLocation
 60        self.path_to_json_ld = inputMModelLocation
 62        data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation)
 63        # Parse Model
 64        parsed_data_model = data_model_parser.parse_model()
 66        # Instantiate DataModelGraph
 67        data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels)
 69        # Generate graph
 70        self.graph_data_model = data_model_grapher.graph
 72        self.dmge = DataModelGraphExplorer(self.graph_data_model)
 74        # check if the type of MModel file is "local"
 75        # currently, the application only supports reading from local JSON-LD files
 76        if inputMModelLocationType == "local":
 77            self.inputMModelLocationType = inputMModelLocationType
 78        else:
 79            raise ValueError(
 80                f"The type '{inputMModelLocationType}' is currently not supported."
 81            )
 83    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
 84        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.
 86        Args:
 87            rootNode: a schema node label (i.e. term).
 88            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
 90        Returns:
 91            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
 93        Raises:
 94            ValueError: rootNode not found in metadata model.
 95        """
 96        pass
 98    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
 99        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.
101        Args:
102            rootNode: a schema object/node label (i.e. term)
103            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
105        Returns:
106            An ordered list of objects, that are all descendants of rootNode.
108        Raises:
109            ValueError: rootNode not found in metadata model.
110        """
111        ordered_nodes = self.dmge.get_descendants_by_edge_type(
112            rootNode, relationshipType, connected=True, ordered=True
113        )
115        ordered_nodes.reverse()
117        return ordered_nodes
119    def getModelManifest(
120        self,
121        title: str,
122        rootNode: str,
123        datasetId: str = None,
124        jsonSchema: str = None,
125        filenames: list = None,
126        useAnnotations: bool = False,
127        sheetUrl: bool = True,
128    ) -> str:
129        """Gets data from the annotations manifest file.
131        TBD: Does this method belong here or in manifest generator?
133        Args:
134            rootNode: a schema node label (i.e. term).
135            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
137        Returns:
138            A manifest URI (assume Google doc for now).
140        Raises:
141            ValueError: rootNode not found in metadata model.
142        """
143        additionalMetadata = {}
144        if filenames:
145            additionalMetadata["Filename"] = filenames
147        mg = ManifestGenerator(
148            path_to_json_ld=self.inputMModelLocation,
149            graph=self.graph_data_model,
150            title=title,
151            root=rootNode,
152            additional_metadata=additionalMetadata,
153            use_annotations=useAnnotations,
154        )
156        if datasetId:
157            return mg.get_manifest(
158                dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl
159            )
161        return mg.get_manifest(sheet_url=sheetUrl)
163    def get_component_requirements(
164        self, source_component: str, as_graph: bool = False
165    ) -> List:
166        """Given a source model component (see for definnition of component), return all components required by it.
167        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
168        Can be utilized to track metadata completion progress across multiple categories of attributes.
170        Args:
171            source_component: an attribute label indicating the source component.
172            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
174        Returns:
175            A list of required components associated with the source component.
176        """
178        # get required components for the input/source component
179        req_components = self.dmge.get_component_requirements(source_component)
181        # retreive components as graph
182        if as_graph:
183            req_components_graph = self.dmge.get_component_requirements_graph(
184                source_component
185            )
187            # serialize component dependencies DAG to a edge list of node tuples
188            req_components = list(req_components_graph.edges())
190            return req_components
192        return req_components
194    # TODO: abstract validation in its own module
195    @tracer.start_as_current_span("MetadataModel::validateModelManifest")
196    def validateModelManifest(
197        self,
198        manifestPath: str,
199        rootNode: str,
200        restrict_rules: bool = False,
201        jsonSchema: Optional[str] = None,
202        project_scope: Optional[List] = None,
203        dataset_scope: Optional[str] = None,
204        access_token: Optional[str] = None,
205    ) -> tuple[list, list]:
206        """Check if provided annotations manifest dataframe satisfies all model requirements.
208        Args:
209            rootNode: a schema node label (i.e. term).
210            manifestPath: a path to the manifest csv file containing annotations.
211            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
213        Returns:
214            A validation status message; if there is an error the message.
215            contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.
217        Raises:
218            ValueError: rootNode not found in metadata model.
219        """
220        # get validation schema for a given node in the data model, if the user has not provided input validation schema
222        if not jsonSchema:
223            # Instantiate Data Model Json Schema
224            self.data_model_js = DataModelJSONSchema(
225                jsonld_path=self.inputMModelLocation, graph=self.graph_data_model
226            )
228            jsonSchema = self.data_model_js.get_json_validation_schema(
229                rootNode, rootNode + "_validation"
230            )
232        errors = []
233        warnings = []
235        load_args = {
236            "dtype": "string",
237        }
238        # get annotations from manifest (array of json annotations corresponding to manifest rows)
239        manifest = load_df(
240            manifestPath,
241            preserve_raw_input=False,
242            allow_na_values=True,
243            **load_args,
244        )  # read manifest csv file as is from manifest path
246        # handler for mismatched components/data types
247        # throw TypeError if the value(s) in the "Component" column differ from the selected template type
248        if ("Component" in manifest.columns) and (
249            (len(manifest["Component"].unique()) > 1)
250            or (manifest["Component"].unique()[0] != rootNode)
251        ):
252            logging.error(
253                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
254                f"selected template type '{rootNode}'."
255            )
257            # row indexes for all rows where 'Component' is rootNode
258            row_idxs = manifest.index[manifest["Component"] != rootNode].tolist()
259            # column index value for the 'Component' column
260            col_idx = manifest.columns.get_loc("Component")
261            # Series with index and 'Component' values from manifest
262            mismatched_ser = manifest.iloc[row_idxs, col_idx]
263            for index, component in mismatched_ser.items():
264                errors.append(
265                    [
266                        index + 2,
267                        "Component",
268                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
269                        # tuple of the component in the manifest and selected template type
270                        # check: R/Reticulate cannnot handle dicts? So returning tuple
271                        (component, rootNode),
272                    ]
273                )
275            return errors, warnings
277        errors, warnings, manifest = validate_all(
278            self,
279            errors=errors,
280            warnings=warnings,
281            manifest=manifest,
282            manifestPath=manifestPath,
283            dmge=self.dmge,
284            jsonSchema=jsonSchema,
285            restrict_rules=restrict_rules,
286            project_scope=project_scope,
287            dataset_scope=dataset_scope,
288            access_token=access_token,
289        )
290        return errors, warnings
292    def populateModelManifest(
293        self, title, manifestPath: str, rootNode: str, return_excel=False
294    ) -> str:
295        """Populate an existing annotations manifest based on a dataframe.
296            TODO: Remove this method; always use getModelManifest instead
298        Args:
299            rootNode: a schema node label (i.e. term).
300            manifestPath: a path to the manifest csv file containing annotations.
302        Returns:
303            A link to the filled in model manifest (e.g. google sheet).
305        Raises:
306            ValueError: rootNode not found in metadata model.
307        """
308        mg = ManifestGenerator(
309            path_to_data_model=self.inputMModelLocation,
310            graph=self.graph_data_model,
311            title=title,
312            root=rootNode,
313        )
315        emptyManifestURL = mg.get_manifest()
317        return mg.populate_manifest_spreadsheet(
318            manifestPath, emptyManifestURL, return_excel=return_excel, title=title
319        )
321    @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest")
322    def submit_metadata_manifest(  # pylint: disable=too-many-arguments, too-many-locals
323        self,
324        manifest_path: str,
325        dataset_id: str,
326        manifest_record_type: str,
327        restrict_rules: bool,
328        access_token: Optional[str] = None,
329        validate_component: Optional[str] = None,
330        file_annotations_upload: bool = True,
331        hide_blanks: bool = False,
332        project_scope: Optional[list] = None,
333        dataset_scope: Optional[str] = None,
334        table_manipulation: str = "replace",
335        table_column_names: str = "class_label",
336        annotation_keys: str = "class_label",
337    ) -> str:
338        """
339        Wrap methods that are responsible for validation of manifests for a given component,
340          and association of the same manifest file with a specified dataset.
342        Args:
343            manifest_path (str): Path to the manifest file, which contains the metadata.
344            dataset_id (str): Synapse ID of the dataset on Synapse containing the
345              metadata manifest file.
346            manifest_record_type (str): How the manifest is stored in Synapse
347            restrict_rules (bool):
348              If True: bypass great expectations and restrict rule options to
349                those implemented in house
350            access_token (Optional[str], optional): Defaults to None.
351            validate_component (Optional[str], optional): Component from the
352              schema based on which the manifest template has been generated.
353            file_annotations_upload (bool, optional): Default to True. If false, do
354              not add annotations to files. Defaults to True.
355            hide_blanks (bool, optional): Defaults to False.
356            project_scope (Optional[list], optional): Defaults to None.
357            table_manipulation (str, optional): Defaults to "replace".
358            table_column_names (str, optional): Defaults to "class_label".
359            annotation_keys (str, optional): Defaults to "class_label".
361        Raises:
362            ValueError: When validate_component is provided, but it cannot be found in the schema.
363            ValidationError: If validation against data model was not successful.
365        Returns:
366            str: If both validation and association were successful.
367        """
368        # TODO: avoid explicitly exposing Synapse store functionality
369        # just instantiate a Store class and let it decide at runtime/config
370        # the store type
371        syn_store = SynapseStorage(
372            access_token=access_token, project_scope=project_scope
373        )
374        manifest_id = None
375        restrict_maniest = False
376        censored_manifest_path = manifest_path.replace(".csv", "_censored.csv")
377        # check if user wants to perform validation or not
378        if validate_component is not None:
379            try:
380                # check if the component ("class" in schema) passed as argument is valid
381                # (present in schema) or not
382                self.dmge.is_class_in_schema(validate_component)
383            except Exception as exc:
384                # a KeyError exception is raised when validate_component fails in the
385                # try-block above here, we are suppressing the KeyError exception and
386                # replacing it with a more descriptive ValueError exception
387                raise ValueError(
388                    f"The component '{validate_component}' could not be found "
389                    f"in the schema here '{self.path_to_json_ld}'"
390                ) from exc
392            # automatic JSON schema generation and validation with that JSON schema
393            val_errors, _ = self.validateModelManifest(
394                manifestPath=manifest_path,
395                rootNode=validate_component,
396                restrict_rules=restrict_rules,
397                project_scope=project_scope,
398                dataset_scope=dataset_scope,
399                access_token=access_token,
400            )
402            # if there are no errors in validation process
403            if val_errors == []:
404                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
405                if os.path.exists(censored_manifest_path):
406                    syn_store.associateMetadataWithFiles(
407                        dmge=self.dmge,
408                        metadataManifestPath=censored_manifest_path,
409                        datasetId=dataset_id,
410                        manifest_record_type=manifest_record_type,
411                        hideBlanks=hide_blanks,
412                        table_manipulation=table_manipulation,
413                        table_column_names=table_column_names,
414                        annotation_keys=annotation_keys,
415                        file_annotations_upload=file_annotations_upload,
416                    )
417                    restrict_maniest = True
419                manifest_id = syn_store.associateMetadataWithFiles(
420                    dmge=self.dmge,
421                    metadataManifestPath=manifest_path,
422                    datasetId=dataset_id,
423                    manifest_record_type=manifest_record_type,
424                    hideBlanks=hide_blanks,
425                    restrict_manifest=restrict_maniest,
426                    table_manipulation=table_manipulation,
427                    table_column_names=table_column_names,
428                    annotation_keys=annotation_keys,
429                    file_annotations_upload=file_annotations_upload,
430                )
432      "No validation errors occured during validation.")
433                return manifest_id
435            else:
436                raise ValidationError(
437                    "Manifest could not be validated under provided data model. "
438                    f"Validation failed with the following errors: {val_errors}"
439                )
441        # no need to perform validation, just submit/associate the metadata manifest file
442        if os.path.exists(censored_manifest_path):
443            syn_store.associateMetadataWithFiles(
444                dmge=self.dmge,
445                metadataManifestPath=censored_manifest_path,
446                datasetId=dataset_id,
447                manifest_record_type=manifest_record_type,
448                hideBlanks=hide_blanks,
449                table_manipulation=table_manipulation,
450                table_column_names=table_column_names,
451                annotation_keys=annotation_keys,
452                file_annotations_upload=file_annotations_upload,
453            )
454            restrict_maniest = True
456        manifest_id = syn_store.associateMetadataWithFiles(
457            dmge=self.dmge,
458            metadataManifestPath=manifest_path,
459            datasetId=dataset_id,
460            manifest_record_type=manifest_record_type,
461            hideBlanks=hide_blanks,
462            restrict_manifest=restrict_maniest,
463            table_manipulation=table_manipulation,
464            table_column_names=table_column_names,
465            annotation_keys=annotation_keys,
466            file_annotations_upload=file_annotations_upload,
467        )
469        logger.debug(
470            "Optional validation was not performed on manifest before association."
471        )
473        return manifest_id
logger = <Logger schematic.models.metadata (WARNING)>
tracer = <opentelemetry.sdk.trace.Tracer object>
class MetadataModel:
 29class MetadataModel(object):
 30    """Metadata model wrapper around specification graph.
 32    Provides basic utilities to:
 34    1) manipulate the metadata model
 35    2) generate metadata model views:
 36        - generate manifest view of the metadata model
 37        - generate validation schema view of the metadata model
 38    """
 40    def __init__(
 41        self,
 42        inputMModelLocation: str,
 43        inputMModelLocationType: str,
 44        data_model_labels: str,
 45    ) -> None:
 46        """Instantiates a MetadataModel object.
 48        Args:
 49            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
 50            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
 51        """
 52        # extract extension of 'inputMModelLocation'
 53        # ensure that it is necessarily pointing to a '.jsonld' file
 55        logger.debug(
 56            f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema."
 57        )
 59        # self.inputMModelLocation remains for backwards compatibility
 60        self.inputMModelLocation = inputMModelLocation
 61        self.path_to_json_ld = inputMModelLocation
 63        data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation)
 64        # Parse Model
 65        parsed_data_model = data_model_parser.parse_model()
 67        # Instantiate DataModelGraph
 68        data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels)
 70        # Generate graph
 71        self.graph_data_model = data_model_grapher.graph
 73        self.dmge = DataModelGraphExplorer(self.graph_data_model)
 75        # check if the type of MModel file is "local"
 76        # currently, the application only supports reading from local JSON-LD files
 77        if inputMModelLocationType == "local":
 78            self.inputMModelLocationType = inputMModelLocationType
 79        else:
 80            raise ValueError(
 81                f"The type '{inputMModelLocationType}' is currently not supported."
 82            )
 84    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
 85        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.
 87        Args:
 88            rootNode: a schema node label (i.e. term).
 89            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
 91        Returns:
 92            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
 94        Raises:
 95            ValueError: rootNode not found in metadata model.
 96        """
 97        pass
 99    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
100        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.
102        Args:
103            rootNode: a schema object/node label (i.e. term)
104            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
106        Returns:
107            An ordered list of objects, that are all descendants of rootNode.
109        Raises:
110            ValueError: rootNode not found in metadata model.
111        """
112        ordered_nodes = self.dmge.get_descendants_by_edge_type(
113            rootNode, relationshipType, connected=True, ordered=True
114        )
116        ordered_nodes.reverse()
118        return ordered_nodes
120    def getModelManifest(
121        self,
122        title: str,
123        rootNode: str,
124        datasetId: str = None,
125        jsonSchema: str = None,
126        filenames: list = None,
127        useAnnotations: bool = False,
128        sheetUrl: bool = True,
129    ) -> str:
130        """Gets data from the annotations manifest file.
132        TBD: Does this method belong here or in manifest generator?
134        Args:
135            rootNode: a schema node label (i.e. term).
136            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
138        Returns:
139            A manifest URI (assume Google doc for now).
141        Raises:
142            ValueError: rootNode not found in metadata model.
143        """
144        additionalMetadata = {}
145        if filenames:
146            additionalMetadata["Filename"] = filenames
148        mg = ManifestGenerator(
149            path_to_json_ld=self.inputMModelLocation,
150            graph=self.graph_data_model,
151            title=title,
152            root=rootNode,
153            additional_metadata=additionalMetadata,
154            use_annotations=useAnnotations,
155        )
157        if datasetId:
158            return mg.get_manifest(
159                dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl
160            )
162        return mg.get_manifest(sheet_url=sheetUrl)
164    def get_component_requirements(
165        self, source_component: str, as_graph: bool = False
166    ) -> List:
167        """Given a source model component (see for definnition of component), return all components required by it.
168        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
169        Can be utilized to track metadata completion progress across multiple categories of attributes.
171        Args:
172            source_component: an attribute label indicating the source component.
173            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
175        Returns:
176            A list of required components associated with the source component.
177        """
179        # get required components for the input/source component
180        req_components = self.dmge.get_component_requirements(source_component)
182        # retreive components as graph
183        if as_graph:
184            req_components_graph = self.dmge.get_component_requirements_graph(
185                source_component
186            )
188            # serialize component dependencies DAG to a edge list of node tuples
189            req_components = list(req_components_graph.edges())
191            return req_components
193        return req_components
195    # TODO: abstract validation in its own module
196    @tracer.start_as_current_span("MetadataModel::validateModelManifest")
197    def validateModelManifest(
198        self,
199        manifestPath: str,
200        rootNode: str,
201        restrict_rules: bool = False,
202        jsonSchema: Optional[str] = None,
203        project_scope: Optional[List] = None,
204        dataset_scope: Optional[str] = None,
205        access_token: Optional[str] = None,
206    ) -> tuple[list, list]:
207        """Check if provided annotations manifest dataframe satisfies all model requirements.
209        Args:
210            rootNode: a schema node label (i.e. term).
211            manifestPath: a path to the manifest csv file containing annotations.
212            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
214        Returns:
215            A validation status message; if there is an error the message.
216            contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.
218        Raises:
219            ValueError: rootNode not found in metadata model.
220        """
221        # get validation schema for a given node in the data model, if the user has not provided input validation schema
223        if not jsonSchema:
224            # Instantiate Data Model Json Schema
225            self.data_model_js = DataModelJSONSchema(
226                jsonld_path=self.inputMModelLocation, graph=self.graph_data_model
227            )
229            jsonSchema = self.data_model_js.get_json_validation_schema(
230                rootNode, rootNode + "_validation"
231            )
233        errors = []
234        warnings = []
236        load_args = {
237            "dtype": "string",
238        }
239        # get annotations from manifest (array of json annotations corresponding to manifest rows)
240        manifest = load_df(
241            manifestPath,
242            preserve_raw_input=False,
243            allow_na_values=True,
244            **load_args,
245        )  # read manifest csv file as is from manifest path
247        # handler for mismatched components/data types
248        # throw TypeError if the value(s) in the "Component" column differ from the selected template type
249        if ("Component" in manifest.columns) and (
250            (len(manifest["Component"].unique()) > 1)
251            or (manifest["Component"].unique()[0] != rootNode)
252        ):
253            logging.error(
254                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
255                f"selected template type '{rootNode}'."
256            )
258            # row indexes for all rows where 'Component' is rootNode
259            row_idxs = manifest.index[manifest["Component"] != rootNode].tolist()
260            # column index value for the 'Component' column
261            col_idx = manifest.columns.get_loc("Component")
262            # Series with index and 'Component' values from manifest
263            mismatched_ser = manifest.iloc[row_idxs, col_idx]
264            for index, component in mismatched_ser.items():
265                errors.append(
266                    [
267                        index + 2,
268                        "Component",
269                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
270                        # tuple of the component in the manifest and selected template type
271                        # check: R/Reticulate cannnot handle dicts? So returning tuple
272                        (component, rootNode),
273                    ]
274                )
276            return errors, warnings
278        errors, warnings, manifest = validate_all(
279            self,
280            errors=errors,
281            warnings=warnings,
282            manifest=manifest,
283            manifestPath=manifestPath,
284            dmge=self.dmge,
285            jsonSchema=jsonSchema,
286            restrict_rules=restrict_rules,
287            project_scope=project_scope,
288            dataset_scope=dataset_scope,
289            access_token=access_token,
290        )
291        return errors, warnings
293    def populateModelManifest(
294        self, title, manifestPath: str, rootNode: str, return_excel=False
295    ) -> str:
296        """Populate an existing annotations manifest based on a dataframe.
297            TODO: Remove this method; always use getModelManifest instead
299        Args:
300            rootNode: a schema node label (i.e. term).
301            manifestPath: a path to the manifest csv file containing annotations.
303        Returns:
304            A link to the filled in model manifest (e.g. google sheet).
306        Raises:
307            ValueError: rootNode not found in metadata model.
308        """
309        mg = ManifestGenerator(
310            path_to_data_model=self.inputMModelLocation,
311            graph=self.graph_data_model,
312            title=title,
313            root=rootNode,
314        )
316        emptyManifestURL = mg.get_manifest()
318        return mg.populate_manifest_spreadsheet(
319            manifestPath, emptyManifestURL, return_excel=return_excel, title=title
320        )
322    @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest")
323    def submit_metadata_manifest(  # pylint: disable=too-many-arguments, too-many-locals
324        self,
325        manifest_path: str,
326        dataset_id: str,
327        manifest_record_type: str,
328        restrict_rules: bool,
329        access_token: Optional[str] = None,
330        validate_component: Optional[str] = None,
331        file_annotations_upload: bool = True,
332        hide_blanks: bool = False,
333        project_scope: Optional[list] = None,
334        dataset_scope: Optional[str] = None,
335        table_manipulation: str = "replace",
336        table_column_names: str = "class_label",
337        annotation_keys: str = "class_label",
338    ) -> str:
339        """
340        Wrap methods that are responsible for validation of manifests for a given component,
341          and association of the same manifest file with a specified dataset.
343        Args:
344            manifest_path (str): Path to the manifest file, which contains the metadata.
345            dataset_id (str): Synapse ID of the dataset on Synapse containing the
346              metadata manifest file.
347            manifest_record_type (str): How the manifest is stored in Synapse
348            restrict_rules (bool):
349              If True: bypass great expectations and restrict rule options to
350                those implemented in house
351            access_token (Optional[str], optional): Defaults to None.
352            validate_component (Optional[str], optional): Component from the
353              schema based on which the manifest template has been generated.
354            file_annotations_upload (bool, optional): Default to True. If false, do
355              not add annotations to files. Defaults to True.
356            hide_blanks (bool, optional): Defaults to False.
357            project_scope (Optional[list], optional): Defaults to None.
358            table_manipulation (str, optional): Defaults to "replace".
359            table_column_names (str, optional): Defaults to "class_label".
360            annotation_keys (str, optional): Defaults to "class_label".
362        Raises:
363            ValueError: When validate_component is provided, but it cannot be found in the schema.
364            ValidationError: If validation against data model was not successful.
366        Returns:
367            str: If both validation and association were successful.
368        """
369        # TODO: avoid explicitly exposing Synapse store functionality
370        # just instantiate a Store class and let it decide at runtime/config
371        # the store type
372        syn_store = SynapseStorage(
373            access_token=access_token, project_scope=project_scope
374        )
375        manifest_id = None
376        restrict_maniest = False
377        censored_manifest_path = manifest_path.replace(".csv", "_censored.csv")
378        # check if user wants to perform validation or not
379        if validate_component is not None:
380            try:
381                # check if the component ("class" in schema) passed as argument is valid
382                # (present in schema) or not
383                self.dmge.is_class_in_schema(validate_component)
384            except Exception as exc:
385                # a KeyError exception is raised when validate_component fails in the
386                # try-block above here, we are suppressing the KeyError exception and
387                # replacing it with a more descriptive ValueError exception
388                raise ValueError(
389                    f"The component '{validate_component}' could not be found "
390                    f"in the schema here '{self.path_to_json_ld}'"
391                ) from exc
393            # automatic JSON schema generation and validation with that JSON schema
394            val_errors, _ = self.validateModelManifest(
395                manifestPath=manifest_path,
396                rootNode=validate_component,
397                restrict_rules=restrict_rules,
398                project_scope=project_scope,
399                dataset_scope=dataset_scope,
400                access_token=access_token,
401            )
403            # if there are no errors in validation process
404            if val_errors == []:
405                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
406                if os.path.exists(censored_manifest_path):
407                    syn_store.associateMetadataWithFiles(
408                        dmge=self.dmge,
409                        metadataManifestPath=censored_manifest_path,
410                        datasetId=dataset_id,
411                        manifest_record_type=manifest_record_type,
412                        hideBlanks=hide_blanks,
413                        table_manipulation=table_manipulation,
414                        table_column_names=table_column_names,
415                        annotation_keys=annotation_keys,
416                        file_annotations_upload=file_annotations_upload,
417                    )
418                    restrict_maniest = True
420                manifest_id = syn_store.associateMetadataWithFiles(
421                    dmge=self.dmge,
422                    metadataManifestPath=manifest_path,
423                    datasetId=dataset_id,
424                    manifest_record_type=manifest_record_type,
425                    hideBlanks=hide_blanks,
426                    restrict_manifest=restrict_maniest,
427                    table_manipulation=table_manipulation,
428                    table_column_names=table_column_names,
429                    annotation_keys=annotation_keys,
430                    file_annotations_upload=file_annotations_upload,
431                )
433      "No validation errors occured during validation.")
434                return manifest_id
436            else:
437                raise ValidationError(
438                    "Manifest could not be validated under provided data model. "
439                    f"Validation failed with the following errors: {val_errors}"
440                )
442        # no need to perform validation, just submit/associate the metadata manifest file
443        if os.path.exists(censored_manifest_path):
444            syn_store.associateMetadataWithFiles(
445                dmge=self.dmge,
446                metadataManifestPath=censored_manifest_path,
447                datasetId=dataset_id,
448                manifest_record_type=manifest_record_type,
449                hideBlanks=hide_blanks,
450                table_manipulation=table_manipulation,
451                table_column_names=table_column_names,
452                annotation_keys=annotation_keys,
453                file_annotations_upload=file_annotations_upload,
454            )
455            restrict_maniest = True
457        manifest_id = syn_store.associateMetadataWithFiles(
458            dmge=self.dmge,
459            metadataManifestPath=manifest_path,
460            datasetId=dataset_id,
461            manifest_record_type=manifest_record_type,
462            hideBlanks=hide_blanks,
463            restrict_manifest=restrict_maniest,
464            table_manipulation=table_manipulation,
465            table_column_names=table_column_names,
466            annotation_keys=annotation_keys,
467            file_annotations_upload=file_annotations_upload,
468        )
470        logger.debug(
471            "Optional validation was not performed on manifest before association."
472        )
474        return manifest_id

Metadata model wrapper around specification graph.

Provides basic utilities to:

1) manipulate the metadata model; 2) generate metadata model views: (a) a manifest view of the metadata model, and (b) a validation schema view of the metadata model.

MetadataModel( inputMModelLocation: str, inputMModelLocationType: str, data_model_labels: str)
40    def __init__(
41        self,
42        inputMModelLocation: str,
43        inputMModelLocationType: str,
44        data_model_labels: str,
45    ) -> None:
46        """Instantiates a MetadataModel object.
48        Args:
49            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
50            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
51        """
52        # extract extension of 'inputMModelLocation'
53        # ensure that it is necessarily pointing to a '.jsonld' file
55        logger.debug(
56            f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema."
57        )
59        # self.inputMModelLocation remains for backwards compatibility
60        self.inputMModelLocation = inputMModelLocation
61        self.path_to_json_ld = inputMModelLocation
63        data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation)
64        # Parse Model
65        parsed_data_model = data_model_parser.parse_model()
67        # Instantiate DataModelGraph
68        data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels)
70        # Generate graph
71        self.graph_data_model = data_model_grapher.graph
73        self.dmge = DataModelGraphExplorer(self.graph_data_model)
75        # check if the type of MModel file is "local"
76        # currently, the application only supports reading from local JSON-LD files
77        if inputMModelLocationType == "local":
78            self.inputMModelLocationType = inputMModelLocationType
79        else:
80            raise ValueError(
81                f"The type '{inputMModelLocationType}' is currently not supported."
82            )

Instantiates a MetadataModel object.

  • inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
  • inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
def getModelSubgraph( self, rootNode: str, subgraphType: str) -> networkx.classes.digraph.DiGraph:
84    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
85        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.
87        Args:
88            rootNode: a schema node label (i.e. term).
89            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
91        Returns:
92            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
94        Raises:
95            ValueError: rootNode not found in metadata model.
96        """
97        pass

Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.

  • rootNode: a schema node label (i.e. term).
  • subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).

A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.

  • ValueError: rootNode not found in metadata model.
def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
 99    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
100        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.
102        Args:
103            rootNode: a schema object/node label (i.e. term)
104            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
106        Returns:
107            An ordered list of objects, that are all descendants of rootNode.
109        Raises:
110            ValueError: rootNode not found in metadata model.
111        """
112        ordered_nodes = self.dmge.get_descendants_by_edge_type(
113            rootNode, relationshipType, connected=True, ordered=True
114        )
116        ordered_nodes.reverse()
118        return ordered_nodes

Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.

  • rootNode: a schema object/node label (i.e. term)
  • relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)

An ordered list of objects, that are all descendants of rootNode.

  • ValueError: rootNode not found in metadata model.
def getModelManifest( self, title: str, rootNode: str, datasetId: str = None, jsonSchema: str = None, filenames: list = None, useAnnotations: bool = False, sheetUrl: bool = True) -> str:
120    def getModelManifest(
121        self,
122        title: str,
123        rootNode: str,
124        datasetId: str = None,
125        jsonSchema: str = None,
126        filenames: list = None,
127        useAnnotations: bool = False,
128        sheetUrl: bool = True,
129    ) -> str:
130        """Gets data from the annotations manifest file.
132        TBD: Does this method belong here or in manifest generator?
134        Args:
135            rootNode: a schema node label (i.e. term).
136            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
138        Returns:
139            A manifest URI (assume Google doc for now).
141        Raises:
142            ValueError: rootNode not found in metadata model.
143        """
144        additionalMetadata = {}
145        if filenames:
146            additionalMetadata["Filename"] = filenames
148        mg = ManifestGenerator(
149            path_to_json_ld=self.inputMModelLocation,
150            graph=self.graph_data_model,
151            title=title,
152            root=rootNode,
153            additional_metadata=additionalMetadata,
154            use_annotations=useAnnotations,
155        )
157        if datasetId:
158            return mg.get_manifest(
159                dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl
160            )
162        return mg.get_manifest(sheet_url=sheetUrl)

Gets data from the annotations manifest file.

TBD: Does this method belong here or in manifest generator?

  • rootNode: a schema node label (i.e. term).
  • useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).

A manifest URI (assume Google doc for now).

  • ValueError: rootNode not found in metadata model.
def get_component_requirements(self, source_component: str, as_graph: bool = False) -> List:
164    def get_component_requirements(
165        self, source_component: str, as_graph: bool = False
166    ) -> List:
167        """Given a source model component (see for definnition of component), return all components required by it.
168        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
169        Can be utilized to track metadata completion progress across multiple categories of attributes.
171        Args:
172            source_component: an attribute label indicating the source component.
173            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
175        Returns:
176            A list of required components associated with the source component.
177        """
179        # get required components for the input/source component
180        req_components = self.dmge.get_component_requirements(source_component)
182        # retreive components as graph
183        if as_graph:
184            req_components_graph = self.dmge.get_component_requirements_graph(
185                source_component
186            )
188            # serialize component dependencies DAG to a edge list of node tuples
189            req_components = list(req_components_graph.edges())
191            return req_components
193        return req_components

Given a source model component, return all components required by it. This is useful for constructing requirement dependencies not only between specific attributes but also between categories/components of attributes, and it can be used to track metadata completion progress across multiple categories of attributes.

  • source_component: an attribute label indicating the source component.
  • as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)

A list of required components associated with the source component.

def validateModelManifest( self, manifestPath: str, rootNode: str, restrict_rules: bool = False, jsonSchema: Optional[str] = None, project_scope: Optional[List] = None, dataset_scope: Optional[str] = None, access_token: Optional[str] = None) -> tuple[list, list]:
196    @tracer.start_as_current_span("MetadataModel::validateModelManifest")
197    def validateModelManifest(
198        self,
199        manifestPath: str,
200        rootNode: str,
201        restrict_rules: bool = False,
202        jsonSchema: Optional[str] = None,
203        project_scope: Optional[List] = None,
204        dataset_scope: Optional[str] = None,
205        access_token: Optional[str] = None,
206    ) -> tuple[list, list]:
207        """Check if provided annotations manifest dataframe satisfies all model requirements.
209        Args:
210            rootNode: a schema node label (i.e. term).
211            manifestPath: a path to the manifest csv file containing annotations.
212            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
214        Returns:
215            A validation status message; if there is an error the message.
216            contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.
218        Raises:
219            ValueError: rootNode not found in metadata model.
220        """
221        # get validation schema for a given node in the data model, if the user has not provided input validation schema
223        if not jsonSchema:
224            # Instantiate Data Model Json Schema
225            self.data_model_js = DataModelJSONSchema(
226                jsonld_path=self.inputMModelLocation, graph=self.graph_data_model
227            )
229            jsonSchema = self.data_model_js.get_json_validation_schema(
230                rootNode, rootNode + "_validation"
231            )
233        errors = []
234        warnings = []
236        load_args = {
237            "dtype": "string",
238        }
239        # get annotations from manifest (array of json annotations corresponding to manifest rows)
240        manifest = load_df(
241            manifestPath,
242            preserve_raw_input=False,
243            allow_na_values=True,
244            **load_args,
245        )  # read manifest csv file as is from manifest path
247        # handler for mismatched components/data types
248        # throw TypeError if the value(s) in the "Component" column differ from the selected template type
249        if ("Component" in manifest.columns) and (
250            (len(manifest["Component"].unique()) > 1)
251            or (manifest["Component"].unique()[0] != rootNode)
252        ):
253            logging.error(
254                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
255                f"selected template type '{rootNode}'."
256            )
258            # row indexes for all rows where 'Component' is rootNode
259            row_idxs = manifest.index[manifest["Component"] != rootNode].tolist()
260            # column index value for the 'Component' column
261            col_idx = manifest.columns.get_loc("Component")
262            # Series with index and 'Component' values from manifest
263            mismatched_ser = manifest.iloc[row_idxs, col_idx]
264            for index, component in mismatched_ser.items():
265                errors.append(
266                    [
267                        index + 2,
268                        "Component",
269                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
270                        # tuple of the component in the manifest and selected template type
271                        # check: R/Reticulate cannnot handle dicts? So returning tuple
272                        (component, rootNode),
273                    ]
274                )
276            return errors, warnings
278        errors, warnings, manifest = validate_all(
279            self,
280            errors=errors,
281            warnings=warnings,
282            manifest=manifest,
283            manifestPath=manifestPath,
284            dmge=self.dmge,
285            jsonSchema=jsonSchema,
286            restrict_rules=restrict_rules,
287            project_scope=project_scope,
288            dataset_scope=dataset_scope,
289            access_token=access_token,
290        )
291        return errors, warnings

Check if provided annotations manifest dataframe satisfies all model requirements.

  • rootNode: a schema node label (i.e. term).
  • manifestPath: a path to the manifest csv file containing annotations.
  • restrict_rules: bypass great expectations and restrict rule options to those implemented in house

A validation status message; if there is an error, the message contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.

  • ValueError: rootNode not found in metadata model.
def populateModelManifest(self, title, manifestPath: str, rootNode: str, return_excel=False) -> str:
293    def populateModelManifest(
294        self, title, manifestPath: str, rootNode: str, return_excel=False
295    ) -> str:
296        """Populate an existing annotations manifest based on a dataframe.
297            TODO: Remove this method; always use getModelManifest instead
299        Args:
300            rootNode: a schema node label (i.e. term).
301            manifestPath: a path to the manifest csv file containing annotations.
303        Returns:
304            A link to the filled in model manifest (e.g. google sheet).
306        Raises:
307            ValueError: rootNode not found in metadata model.
308        """
309        mg = ManifestGenerator(
310            path_to_data_model=self.inputMModelLocation,
311            graph=self.graph_data_model,
312            title=title,
313            root=rootNode,
314        )
316        emptyManifestURL = mg.get_manifest()
318        return mg.populate_manifest_spreadsheet(
319            manifestPath, emptyManifestURL, return_excel=return_excel, title=title
320        )

Populate an existing annotations manifest based on a dataframe. TODO: Remove this method; always use getModelManifest instead

  • rootNode: a schema node label (i.e. term).
  • manifestPath: a path to the manifest csv file containing annotations.

A link to the filled in model manifest (e.g. google sheet).

  • ValueError: rootNode not found in metadata model.
def submit_metadata_manifest( self, manifest_path: str, dataset_id: str, manifest_record_type: str, restrict_rules: bool, access_token: Optional[str] = None, validate_component: Optional[str] = None, file_annotations_upload: bool = True, hide_blanks: bool = False, project_scope: Optional[list] = None, dataset_scope: Optional[str] = None, table_manipulation: str = 'replace', table_column_names: str = 'class_label', annotation_keys: str = 'class_label') -> str:
322    @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest")
323    def submit_metadata_manifest(  # pylint: disable=too-many-arguments, too-many-locals
324        self,
325        manifest_path: str,
326        dataset_id: str,
327        manifest_record_type: str,
328        restrict_rules: bool,
329        access_token: Optional[str] = None,
330        validate_component: Optional[str] = None,
331        file_annotations_upload: bool = True,
332        hide_blanks: bool = False,
333        project_scope: Optional[list] = None,
334        dataset_scope: Optional[str] = None,
335        table_manipulation: str = "replace",
336        table_column_names: str = "class_label",
337        annotation_keys: str = "class_label",
338    ) -> str:
339        """
340        Wrap methods that are responsible for validation of manifests for a given component,
341          and association of the same manifest file with a specified dataset.
343        Args:
344            manifest_path (str): Path to the manifest file, which contains the metadata.
345            dataset_id (str): Synapse ID of the dataset on Synapse containing the
346              metadata manifest file.
347            manifest_record_type (str): How the manifest is stored in Synapse
348            restrict_rules (bool):
349              If True: bypass great expectations and restrict rule options to
350                those implemented in house
351            access_token (Optional[str], optional): Defaults to None.
352            validate_component (Optional[str], optional): Component from the
353              schema based on which the manifest template has been generated.
354            file_annotations_upload (bool, optional): Default to True. If false, do
355              not add annotations to files. Defaults to True.
356            hide_blanks (bool, optional): Defaults to False.
357            project_scope (Optional[list], optional): Defaults to None.
358            table_manipulation (str, optional): Defaults to "replace".
359            table_column_names (str, optional): Defaults to "class_label".
360            annotation_keys (str, optional): Defaults to "class_label".
362        Raises:
363            ValueError: When validate_component is provided, but it cannot be found in the schema.
364            ValidationError: If validation against data model was not successful.
366        Returns:
367            str: If both validation and association were successful.
368        """
369        # TODO: avoid explicitly exposing Synapse store functionality
370        # just instantiate a Store class and let it decide at runtime/config
371        # the store type
372        syn_store = SynapseStorage(
373            access_token=access_token, project_scope=project_scope
374        )
375        manifest_id = None
376        restrict_maniest = False
377        censored_manifest_path = manifest_path.replace(".csv", "_censored.csv")
378        # check if user wants to perform validation or not
379        if validate_component is not None:
380            try:
381                # check if the component ("class" in schema) passed as argument is valid
382                # (present in schema) or not
383                self.dmge.is_class_in_schema(validate_component)
384            except Exception as exc:
385                # a KeyError exception is raised when validate_component fails in the
386                # try-block above here, we are suppressing the KeyError exception and
387                # replacing it with a more descriptive ValueError exception
388                raise ValueError(
389                    f"The component '{validate_component}' could not be found "
390                    f"in the schema here '{self.path_to_json_ld}'"
391                ) from exc
393            # automatic JSON schema generation and validation with that JSON schema
394            val_errors, _ = self.validateModelManifest(
395                manifestPath=manifest_path,
396                rootNode=validate_component,
397                restrict_rules=restrict_rules,
398                project_scope=project_scope,
399                dataset_scope=dataset_scope,
400                access_token=access_token,
401            )
403            # if there are no errors in validation process
404            if val_errors == []:
405                # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
406                if os.path.exists(censored_manifest_path):
407                    syn_store.associateMetadataWithFiles(
408                        dmge=self.dmge,
409                        metadataManifestPath=censored_manifest_path,
410                        datasetId=dataset_id,
411                        manifest_record_type=manifest_record_type,
412                        hideBlanks=hide_blanks,
413                        table_manipulation=table_manipulation,
414                        table_column_names=table_column_names,
415                        annotation_keys=annotation_keys,
416                        file_annotations_upload=file_annotations_upload,
417                    )
418                    restrict_maniest = True
420                manifest_id = syn_store.associateMetadataWithFiles(
421                    dmge=self.dmge,
422                    metadataManifestPath=manifest_path,
423                    datasetId=dataset_id,
424                    manifest_record_type=manifest_record_type,
425                    hideBlanks=hide_blanks,
426                    restrict_manifest=restrict_maniest,
427                    table_manipulation=table_manipulation,
428                    table_column_names=table_column_names,
429                    annotation_keys=annotation_keys,
430                    file_annotations_upload=file_annotations_upload,
431                )
433      "No validation errors occured during validation.")
434                return manifest_id
436            else:
437                raise ValidationError(
438                    "Manifest could not be validated under provided data model. "
439                    f"Validation failed with the following errors: {val_errors}"
440                )
442        # no need to perform validation, just submit/associate the metadata manifest file
443        if os.path.exists(censored_manifest_path):
444            syn_store.associateMetadataWithFiles(
445                dmge=self.dmge,
446                metadataManifestPath=censored_manifest_path,
447                datasetId=dataset_id,
448                manifest_record_type=manifest_record_type,
449                hideBlanks=hide_blanks,
450                table_manipulation=table_manipulation,
451                table_column_names=table_column_names,
452                annotation_keys=annotation_keys,
453                file_annotations_upload=file_annotations_upload,
454            )
455            restrict_maniest = True
457        manifest_id = syn_store.associateMetadataWithFiles(
458            dmge=self.dmge,
459            metadataManifestPath=manifest_path,
460            datasetId=dataset_id,
461            manifest_record_type=manifest_record_type,
462            hideBlanks=hide_blanks,
463            restrict_manifest=restrict_maniest,
464            table_manipulation=table_manipulation,
465            table_column_names=table_column_names,
466            annotation_keys=annotation_keys,
467            file_annotations_upload=file_annotations_upload,
468        )
470        logger.debug(
471            "Optional validation was not performed on manifest before association."
472        )
474        return manifest_id

Wrap methods that are responsible for validation of manifests for a given component, and association of the same manifest file with a specified dataset.

  • manifest_path (str): Path to the manifest file, which contains the metadata.
  • dataset_id (str): Synapse ID of the dataset on Synapse containing the metadata manifest file.
  • manifest_record_type (str): How the manifest is stored in Synapse
  • restrict_rules (bool): If True: bypass great expectations and restrict rule options to those implemented in house
  • access_token (Optional[str], optional): Defaults to None.
  • validate_component (Optional[str], optional): Component from the schema based on which the manifest template has been generated.
  • file_annotations_upload (bool, optional): If False, do not add annotations to files. Defaults to True.
  • hide_blanks (bool, optional): Defaults to False.
  • project_scope (Optional[list], optional): Defaults to None.
  • table_manipulation (str, optional): Defaults to "replace".
  • table_column_names (str, optional): Defaults to "class_label".
  • annotation_keys (str, optional): Defaults to "class_label".
  • ValueError: When validate_component is provided, but it cannot be found in the schema.
  • ValidationError: If validation against data model was not successful.

str: If both validation and association were successful.