schematic.models.metadata
import logging
import os
from os.path import exists

# allows specifying explicit variable types
from typing import Any, Dict, List, Optional, Text

import networkx as nx
from jsonschema import ValidationError
from opentelemetry import trace

from schematic.manifest.generator import ManifestGenerator
from schematic.models.validate_manifest import validate_all
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_json_schema import DataModelJSONSchema
from schematic.schemas.data_model_parser import DataModelParser

# TODO: This module should only be aware of the store interface
# we shouldn't need to expose Synapse functionality explicitly
from schematic.store.synapse import SynapseStorage
from schematic.utils.df_utils import load_df

logger = logging.getLogger(__name__)

tracer = trace.get_tracer("Schematic")


class MetadataModel(object):
    """Metadata model wrapper around the schema.org specification graph.

    Provides basic utilities to:

    1) manipulate the metadata model
    2) generate metadata model views:
       - generate manifest view of the metadata model
       - generate validation schema view of the metadata model
    """

    def __init__(
        self,
        inputMModelLocation: str,
        inputMModelLocationType: str,
        data_model_labels: str,
    ) -> None:
        """Instantiates a MetadataModel object.

        Args:
            inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
            inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
            data_model_labels: label convention, passed through to DataModelGraph.

        Raises:
            ValueError: if inputMModelLocationType is anything other than 'local',
                the only location type currently supported.
        """
        logger.debug(
            f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema."
        )

        # self.inputMModelLocation remains for backwards compatibility
        self.inputMModelLocation = inputMModelLocation
        self.path_to_json_ld = inputMModelLocation

        # Parse the data model
        data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation)
        parsed_data_model = data_model_parser.parse_model()

        # Instantiate DataModelGraph and generate the graph
        data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels)
        self.graph_data_model = data_model_grapher.graph

        self.dmge = DataModelGraphExplorer(self.graph_data_model)

        # check if the type of MModel file is "local"
        # currently, the application only supports reading from local JSON-LD files
        if inputMModelLocationType == "local":
            self.inputMModelLocationType = inputMModelLocationType
        else:
            raise ValueError(
                f"The type '{inputMModelLocationType}' is currently not supported."
            )

    def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph:
        """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.

        Args:
            rootNode: a schema node label (i.e. term).
            subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).

        Returns:
            A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        # NOTE(review): not implemented — currently returns None despite the
        # nx.DiGraph return annotation; callers must not rely on this method.
        pass

    def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]:
        """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.

        Args:
            rootNode: a schema object/node label (i.e. term)
            relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)

        Returns:
            An ordered list of objects, that are all descendants of rootNode.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        ordered_nodes = self.dmge.get_descendants_by_edge_type(
            rootNode, relationshipType, connected=True, ordered=True
        )

        # reverse so the topological ordering starts from the root side
        ordered_nodes.reverse()

        return ordered_nodes

    def getModelManifest(
        self,
        title: str,
        rootNode: str,
        datasetId: Optional[str] = None,
        jsonSchema: Optional[str] = None,
        filenames: Optional[list] = None,
        useAnnotations: bool = False,
        sheetUrl: bool = True,
    ) -> str:
        """Gets data from the annotations manifest file.

        TBD: Does this method belong here or in manifest generator?

        Args:
            title: title of the generated manifest.
            rootNode: a schema node label (i.e. term).
            datasetId: if provided, the manifest is generated against this dataset.
            jsonSchema: optional JSON validation schema to generate the manifest against.
            filenames: optional list of filenames pre-populated into the 'Filename' column.
            useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
            sheetUrl: if True return a sheet URL rather than the raw manifest.

        Returns:
            A manifest URI (assume Google doc for now).

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        additionalMetadata = {}
        if filenames:
            additionalMetadata["Filename"] = filenames

        mg = ManifestGenerator(
            path_to_json_ld=self.inputMModelLocation,
            graph=self.graph_data_model,
            title=title,
            root=rootNode,
            additional_metadata=additionalMetadata,
            use_annotations=useAnnotations,
        )

        if datasetId:
            return mg.get_manifest(
                dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl
            )

        return mg.get_manifest(sheet_url=sheetUrl)

    def get_component_requirements(
        self, source_component: str, as_graph: bool = False
    ) -> List:
        """Given a source model component (see https://w3id.org/biolink/vocab/category for definition of component), return all components required by it.
        Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes;
        Can be utilized to track metadata completion progress across multiple categories of attributes.

        Args:
            source_component: an attribute label indicating the source component.
            as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)

        Returns:
            A list of required components associated with the source component.
        """
        # get required components for the input/source component
        req_components = self.dmge.get_component_requirements(source_component)

        # retrieve components as graph
        if as_graph:
            req_components_graph = self.dmge.get_component_requirements_graph(
                source_component
            )

            # serialize component dependencies DAG to an edge list of node tuples
            req_components = list(req_components_graph.edges())

        return req_components

    # TODO: abstract validation in its own module
    @tracer.start_as_current_span("MetadataModel::validateModelManifest")
    def validateModelManifest(
        self,
        manifestPath: str,
        rootNode: str,
        restrict_rules: bool = False,
        jsonSchema: Optional[str] = None,
        project_scope: Optional[List] = None,
        dataset_scope: Optional[str] = None,
        access_token: Optional[str] = None,
    ) -> tuple[list, list]:
        """Check if provided annotations manifest dataframe satisfies all model requirements.

        Args:
            rootNode: a schema node label (i.e. term).
            manifestPath: a path to the manifest csv file containing annotations.
            restrict_rules: bypass great expectations and restrict rule options to those implemented in house
            jsonSchema: optional pre-built JSON validation schema; generated from the model when omitted.
            project_scope: optional list of projects to limit validation lookups to.
            dataset_scope: optional dataset to limit validation lookups to.
            access_token: optional token used by validation rules that query the store.

        Returns:
            A validation status message; if there is an error the message.
            contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        # get validation schema for a given node in the data model, if the user has not provided input validation schema
        if not jsonSchema:
            # Instantiate Data Model Json Schema
            self.data_model_js = DataModelJSONSchema(
                jsonld_path=self.inputMModelLocation, graph=self.graph_data_model
            )

            jsonSchema = self.data_model_js.get_json_validation_schema(
                rootNode, rootNode + "_validation"
            )

        errors = []
        warnings = []

        load_args = {
            "dtype": "string",
        }
        # get annotations from manifest (array of json annotations corresponding to manifest rows)
        manifest = load_df(
            manifestPath,
            preserve_raw_input=False,
            allow_na_values=True,
            **load_args,
        )  # read manifest csv file as is from manifest path

        # handler for mismatched components/data types:
        # if any value in the "Component" column differs from the selected template
        # type, log the mismatch and return per-row errors without running full validation
        if ("Component" in manifest.columns) and (
            (len(manifest["Component"].unique()) > 1)
            or (manifest["Component"].unique()[0] != rootNode)
        ):
            # use the module logger (the original called the root logger by mistake)
            logger.error(
                f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the "
                f"selected template type '{rootNode}'."
            )

            # row indexes for all rows where 'Component' does not match rootNode
            row_idxs = manifest.index[manifest["Component"] != rootNode].tolist()
            # column index value for the 'Component' column
            col_idx = manifest.columns.get_loc("Component")
            # Series with index and 'Component' values from manifest
            mismatched_ser = manifest.iloc[row_idxs, col_idx]
            for index, component in mismatched_ser.items():
                errors.append(
                    [
                        # +2 offsets the header row and pandas' zero-based index
                        index + 2,
                        "Component",
                        f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'",
                        # tuple of the component in the manifest and selected template type
                        # check: R/Reticulate cannot handle dicts? So returning tuple
                        (component, rootNode),
                    ]
                )

            return errors, warnings

        errors, warnings, manifest = validate_all(
            self,
            errors=errors,
            warnings=warnings,
            manifest=manifest,
            manifestPath=manifestPath,
            dmge=self.dmge,
            jsonSchema=jsonSchema,
            restrict_rules=restrict_rules,
            project_scope=project_scope,
            dataset_scope=dataset_scope,
            access_token=access_token,
        )
        return errors, warnings

    def populateModelManifest(
        self, title, manifestPath: str, rootNode: str, return_excel: bool = False
    ) -> str:
        """Populate an existing annotations manifest based on a dataframe.
        TODO: Remove this method; always use getModelManifest instead

        Args:
            rootNode: a schema node label (i.e. term).
            manifestPath: a path to the manifest csv file containing annotations.
            return_excel: if True return a populated Excel file instead of a sheet link.

        Returns:
            A link to the filled in model manifest (e.g. google sheet).

        Raises:
            ValueError: rootNode not found in metadata model.
        """
        # NOTE(review): this passes path_to_data_model while getModelManifest
        # passes path_to_json_ld to ManifestGenerator — confirm both kwargs
        # are accepted or unify them.
        mg = ManifestGenerator(
            path_to_data_model=self.inputMModelLocation,
            graph=self.graph_data_model,
            title=title,
            root=rootNode,
        )

        emptyManifestURL = mg.get_manifest()

        return mg.populate_manifest_spreadsheet(
            manifestPath, emptyManifestURL, return_excel=return_excel, title=title
        )

    def _associate_manifest_files(
        self,
        syn_store: SynapseStorage,
        manifest_path: str,
        censored_manifest_path: str,
        dataset_id: str,
        manifest_record_type: str,
        hide_blanks: bool,
        table_manipulation: str,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool,
    ) -> str:
        """Associate the manifest (and its censored copy, when present) with a dataset.

        If a censored copy of the manifest exists on disk, it is associated
        first, and the uncensored manifest is then associated with access
        restrictions enabled.

        Returns:
            str: Synapse ID of the associated (uncensored) manifest.
        """
        restrict_manifest = False
        if os.path.exists(censored_manifest_path):
            syn_store.associateMetadataWithFiles(
                dmge=self.dmge,
                metadataManifestPath=censored_manifest_path,
                datasetId=dataset_id,
                manifest_record_type=manifest_record_type,
                hideBlanks=hide_blanks,
                table_manipulation=table_manipulation,
                table_column_names=table_column_names,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
            # restrict the uncensored manifest since a censored copy exists
            restrict_manifest = True

        return syn_store.associateMetadataWithFiles(
            dmge=self.dmge,
            metadataManifestPath=manifest_path,
            datasetId=dataset_id,
            manifest_record_type=manifest_record_type,
            hideBlanks=hide_blanks,
            restrict_manifest=restrict_manifest,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
            annotation_keys=annotation_keys,
            file_annotations_upload=file_annotations_upload,
        )

    @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest")
    def submit_metadata_manifest(  # pylint: disable=too-many-arguments, too-many-locals
        self,
        manifest_path: str,
        dataset_id: str,
        manifest_record_type: str,
        restrict_rules: bool,
        access_token: Optional[str] = None,
        validate_component: Optional[str] = None,
        file_annotations_upload: bool = True,
        hide_blanks: bool = False,
        project_scope: Optional[list] = None,
        dataset_scope: Optional[str] = None,
        table_manipulation: str = "replace",
        table_column_names: str = "class_label",
        annotation_keys: str = "class_label",
    ) -> str:
        """
        Wrap methods that are responsible for validation of manifests for a given component,
        and association of the same manifest file with a specified dataset.

        Args:
            manifest_path (str): Path to the manifest file, which contains the metadata.
            dataset_id (str): Synapse ID of the dataset on Synapse containing the
                metadata manifest file.
            manifest_record_type (str): How the manifest is stored in Synapse
            restrict_rules (bool):
                If True: bypass great expectations and restrict rule options to
                those implemented in house
            access_token (Optional[str], optional): Defaults to None.
            validate_component (Optional[str], optional): Component from the schema.org
                schema based on which the manifest template has been generated.
            file_annotations_upload (bool, optional): Default to True. If false, do
                not add annotations to files. Defaults to True.
            hide_blanks (bool, optional): Defaults to False.
            project_scope (Optional[list], optional): Defaults to None.
            dataset_scope (Optional[str], optional): Defaults to None.
            table_manipulation (str, optional): Defaults to "replace".
            table_column_names (str, optional): Defaults to "class_label".
            annotation_keys (str, optional): Defaults to "class_label".

        Raises:
            ValueError: When validate_component is provided, but it cannot be found in the schema.
            ValidationError: If validation against data model was not successful.

        Returns:
            str: If both validation and association were successful.
        """
        # TODO: avoid explicitly exposing Synapse store functionality
        # just instantiate a Store class and let it decide at runtime/config
        # the store type
        syn_store = SynapseStorage(
            access_token=access_token, project_scope=project_scope
        )
        censored_manifest_path = manifest_path.replace(".csv", "_censored.csv")
        # check if user wants to perform validation or not
        if validate_component is not None:
            try:
                # check if the component ("class" in schema) passed as argument is valid
                # (present in schema) or not
                self.dmge.is_class_in_schema(validate_component)
            except Exception as exc:
                # a KeyError exception is raised when validate_component fails in the
                # try-block above here, we are suppressing the KeyError exception and
                # replacing it with a more descriptive ValueError exception
                raise ValueError(
                    f"The component '{validate_component}' could not be found "
                    f"in the schema here '{self.path_to_json_ld}'"
                ) from exc

            # automatic JSON schema generation and validation with that JSON schema
            val_errors, _ = self.validateModelManifest(
                manifestPath=manifest_path,
                rootNode=validate_component,
                restrict_rules=restrict_rules,
                project_scope=project_scope,
                dataset_scope=dataset_scope,
                access_token=access_token,
            )

            # if there are errors, abort before associating anything
            if val_errors != []:
                raise ValidationError(
                    "Manifest could not be validated under provided data model. "
                    f"Validation failed with the following errors: {val_errors}"
                )

            # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id`
            manifest_id = self._associate_manifest_files(
                syn_store=syn_store,
                manifest_path=manifest_path,
                censored_manifest_path=censored_manifest_path,
                dataset_id=dataset_id,
                manifest_record_type=manifest_record_type,
                hide_blanks=hide_blanks,
                table_manipulation=table_manipulation,
                table_column_names=table_column_names,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )

            logger.info("No validation errors occurred during validation.")
            return manifest_id

        # no need to perform validation, just submit/associate the metadata manifest file
        manifest_id = self._associate_manifest_files(
            syn_store=syn_store,
            manifest_path=manifest_path,
            censored_manifest_path=censored_manifest_path,
            dataset_id=dataset_id,
            manifest_record_type=manifest_record_type,
            hide_blanks=hide_blanks,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
            annotation_keys=annotation_keys,
            file_annotations_upload=file_annotations_upload,
        )

        logger.debug(
            "Optional validation was not performed on manifest before association."
        )

        return manifest_id
29class MetadataModel(object): 30 """Metadata model wrapper around schema.org specification graph. 31 32 Provides basic utilities to: 33 34 1) manipulate the metadata model 35 2) generate metadata model views: 36 - generate manifest view of the metadata model 37 - generate validation schema view of the metadata model 38 """ 39 40 def __init__( 41 self, 42 inputMModelLocation: str, 43 inputMModelLocationType: str, 44 data_model_labels: str, 45 ) -> None: 46 """Instantiates a MetadataModel object. 47 48 Args: 49 inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location 50 inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine) 51 """ 52 # extract extension of 'inputMModelLocation' 53 # ensure that it is necessarily pointing to a '.jsonld' file 54 55 logger.debug( 56 f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema." 57 ) 58 59 # self.inputMModelLocation remains for backwards compatibility 60 self.inputMModelLocation = inputMModelLocation 61 self.path_to_json_ld = inputMModelLocation 62 63 data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation) 64 # Parse Model 65 parsed_data_model = data_model_parser.parse_model() 66 67 # Instantiate DataModelGraph 68 data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) 69 70 # Generate graph 71 self.graph_data_model = data_model_grapher.graph 72 73 self.dmge = DataModelGraphExplorer(self.graph_data_model) 74 75 # check if the type of MModel file is "local" 76 # currently, the application only supports reading from local JSON-LD files 77 if inputMModelLocationType == "local": 78 self.inputMModelLocationType = inputMModelLocationType 79 else: 80 raise ValueError( 81 f"The type '{inputMModelLocationType}' is currently not supported." 
82 ) 83 84 def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph: 85 """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType. 86 87 Args: 88 rootNode: a schema node label (i.e. term). 89 subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels). 90 91 Returns: 92 A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants. 93 94 Raises: 95 ValueError: rootNode not found in metadata model. 96 """ 97 pass 98 99 def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]: 100 """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type. 101 102 Args: 103 rootNode: a schema object/node label (i.e. term) 104 relationshipType: edge label type of the schema subgraph (e.g. requiresDependency) 105 106 Returns: 107 An ordered list of objects, that are all descendants of rootNode. 108 109 Raises: 110 ValueError: rootNode not found in metadata model. 111 """ 112 ordered_nodes = self.dmge.get_descendants_by_edge_type( 113 rootNode, relationshipType, connected=True, ordered=True 114 ) 115 116 ordered_nodes.reverse() 117 118 return ordered_nodes 119 120 def getModelManifest( 121 self, 122 title: str, 123 rootNode: str, 124 datasetId: str = None, 125 jsonSchema: str = None, 126 filenames: list = None, 127 useAnnotations: bool = False, 128 sheetUrl: bool = True, 129 ) -> str: 130 """Gets data from the annotations manifest file. 131 132 TBD: Does this method belong here or in manifest generator? 133 134 Args: 135 rootNode: a schema node label (i.e. term). 136 useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default). 137 138 Returns: 139 A manifest URI (assume Google doc for now). 140 141 Raises: 142 ValueError: rootNode not found in metadata model. 
143 """ 144 additionalMetadata = {} 145 if filenames: 146 additionalMetadata["Filename"] = filenames 147 148 mg = ManifestGenerator( 149 path_to_json_ld=self.inputMModelLocation, 150 graph=self.graph_data_model, 151 title=title, 152 root=rootNode, 153 additional_metadata=additionalMetadata, 154 use_annotations=useAnnotations, 155 ) 156 157 if datasetId: 158 return mg.get_manifest( 159 dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl 160 ) 161 162 return mg.get_manifest(sheet_url=sheetUrl) 163 164 def get_component_requirements( 165 self, source_component: str, as_graph: bool = False 166 ) -> List: 167 """Given a source model component (see https://w3id.org/biolink/vocab/category for definnition of component), return all components required by it. 168 Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes; 169 Can be utilized to track metadata completion progress across multiple categories of attributes. 170 171 Args: 172 source_component: an attribute label indicating the source component. 173 as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG) 174 175 Returns: 176 A list of required components associated with the source component. 
177 """ 178 179 # get required components for the input/source component 180 req_components = self.dmge.get_component_requirements(source_component) 181 182 # retreive components as graph 183 if as_graph: 184 req_components_graph = self.dmge.get_component_requirements_graph( 185 source_component 186 ) 187 188 # serialize component dependencies DAG to a edge list of node tuples 189 req_components = list(req_components_graph.edges()) 190 191 return req_components 192 193 return req_components 194 195 # TODO: abstract validation in its own module 196 @tracer.start_as_current_span("MetadataModel::validateModelManifest") 197 def validateModelManifest( 198 self, 199 manifestPath: str, 200 rootNode: str, 201 restrict_rules: bool = False, 202 jsonSchema: Optional[str] = None, 203 project_scope: Optional[List] = None, 204 dataset_scope: Optional[str] = None, 205 access_token: Optional[str] = None, 206 ) -> tuple[list, list]: 207 """Check if provided annotations manifest dataframe satisfies all model requirements. 208 209 Args: 210 rootNode: a schema node label (i.e. term). 211 manifestPath: a path to the manifest csv file containing annotations. 212 restrict_rules: bypass great expectations and restrict rule options to those implemented in house 213 214 Returns: 215 A validation status message; if there is an error the message. 216 contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record. 217 218 Raises: 219 ValueError: rootNode not found in metadata model. 
220 """ 221 # get validation schema for a given node in the data model, if the user has not provided input validation schema 222 223 if not jsonSchema: 224 # Instantiate Data Model Json Schema 225 self.data_model_js = DataModelJSONSchema( 226 jsonld_path=self.inputMModelLocation, graph=self.graph_data_model 227 ) 228 229 jsonSchema = self.data_model_js.get_json_validation_schema( 230 rootNode, rootNode + "_validation" 231 ) 232 233 errors = [] 234 warnings = [] 235 236 load_args = { 237 "dtype": "string", 238 } 239 # get annotations from manifest (array of json annotations corresponding to manifest rows) 240 manifest = load_df( 241 manifestPath, 242 preserve_raw_input=False, 243 allow_na_values=True, 244 **load_args, 245 ) # read manifest csv file as is from manifest path 246 247 # handler for mismatched components/data types 248 # throw TypeError if the value(s) in the "Component" column differ from the selected template type 249 if ("Component" in manifest.columns) and ( 250 (len(manifest["Component"].unique()) > 1) 251 or (manifest["Component"].unique()[0] != rootNode) 252 ): 253 logging.error( 254 f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the " 255 f"selected template type '{rootNode}'." 256 ) 257 258 # row indexes for all rows where 'Component' is rootNode 259 row_idxs = manifest.index[manifest["Component"] != rootNode].tolist() 260 # column index value for the 'Component' column 261 col_idx = manifest.columns.get_loc("Component") 262 # Series with index and 'Component' values from manifest 263 mismatched_ser = manifest.iloc[row_idxs, col_idx] 264 for index, component in mismatched_ser.items(): 265 errors.append( 266 [ 267 index + 2, 268 "Component", 269 f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'", 270 # tuple of the component in the manifest and selected template type 271 # check: R/Reticulate cannnot handle dicts? 
So returning tuple 272 (component, rootNode), 273 ] 274 ) 275 276 return errors, warnings 277 278 errors, warnings, manifest = validate_all( 279 self, 280 errors=errors, 281 warnings=warnings, 282 manifest=manifest, 283 manifestPath=manifestPath, 284 dmge=self.dmge, 285 jsonSchema=jsonSchema, 286 restrict_rules=restrict_rules, 287 project_scope=project_scope, 288 dataset_scope=dataset_scope, 289 access_token=access_token, 290 ) 291 return errors, warnings 292 293 def populateModelManifest( 294 self, title, manifestPath: str, rootNode: str, return_excel=False 295 ) -> str: 296 """Populate an existing annotations manifest based on a dataframe. 297 TODO: Remove this method; always use getModelManifest instead 298 299 Args: 300 rootNode: a schema node label (i.e. term). 301 manifestPath: a path to the manifest csv file containing annotations. 302 303 Returns: 304 A link to the filled in model manifest (e.g. google sheet). 305 306 Raises: 307 ValueError: rootNode not found in metadata model. 
308 """ 309 mg = ManifestGenerator( 310 path_to_data_model=self.inputMModelLocation, 311 graph=self.graph_data_model, 312 title=title, 313 root=rootNode, 314 ) 315 316 emptyManifestURL = mg.get_manifest() 317 318 return mg.populate_manifest_spreadsheet( 319 manifestPath, emptyManifestURL, return_excel=return_excel, title=title 320 ) 321 322 @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest") 323 def submit_metadata_manifest( # pylint: disable=too-many-arguments, too-many-locals 324 self, 325 manifest_path: str, 326 dataset_id: str, 327 manifest_record_type: str, 328 restrict_rules: bool, 329 access_token: Optional[str] = None, 330 validate_component: Optional[str] = None, 331 file_annotations_upload: bool = True, 332 hide_blanks: bool = False, 333 project_scope: Optional[list] = None, 334 dataset_scope: Optional[str] = None, 335 table_manipulation: str = "replace", 336 table_column_names: str = "class_label", 337 annotation_keys: str = "class_label", 338 ) -> str: 339 """ 340 Wrap methods that are responsible for validation of manifests for a given component, 341 and association of the same manifest file with a specified dataset. 342 343 Args: 344 manifest_path (str): Path to the manifest file, which contains the metadata. 345 dataset_id (str): Synapse ID of the dataset on Synapse containing the 346 metadata manifest file. 347 manifest_record_type (str): How the manifest is stored in Synapse 348 restrict_rules (bool): 349 If True: bypass great expectations and restrict rule options to 350 those implemented in house 351 access_token (Optional[str], optional): Defaults to None. 352 validate_component (Optional[str], optional): Component from the schema.org 353 schema based on which the manifest template has been generated. 354 file_annotations_upload (bool, optional): Default to True. If false, do 355 not add annotations to files. Defaults to True. 356 hide_blanks (bool, optional): Defaults to False. 
357 project_scope (Optional[list], optional): Defaults to None. 358 table_manipulation (str, optional): Defaults to "replace". 359 table_column_names (str, optional): Defaults to "class_label". 360 annotation_keys (str, optional): Defaults to "class_label". 361 362 Raises: 363 ValueError: When validate_component is provided, but it cannot be found in the schema. 364 ValidationError: If validation against data model was not successful. 365 366 Returns: 367 str: If both validation and association were successful. 368 """ 369 # TODO: avoid explicitly exposing Synapse store functionality 370 # just instantiate a Store class and let it decide at runtime/config 371 # the store type 372 syn_store = SynapseStorage( 373 access_token=access_token, project_scope=project_scope 374 ) 375 manifest_id = None 376 restrict_maniest = False 377 censored_manifest_path = manifest_path.replace(".csv", "_censored.csv") 378 # check if user wants to perform validation or not 379 if validate_component is not None: 380 try: 381 # check if the component ("class" in schema) passed as argument is valid 382 # (present in schema) or not 383 self.dmge.is_class_in_schema(validate_component) 384 except Exception as exc: 385 # a KeyError exception is raised when validate_component fails in the 386 # try-block above here, we are suppressing the KeyError exception and 387 # replacing it with a more descriptive ValueError exception 388 raise ValueError( 389 f"The component '{validate_component}' could not be found " 390 f"in the schema here '{self.path_to_json_ld}'" 391 ) from exc 392 393 # automatic JSON schema generation and validation with that JSON schema 394 val_errors, _ = self.validateModelManifest( 395 manifestPath=manifest_path, 396 rootNode=validate_component, 397 restrict_rules=restrict_rules, 398 project_scope=project_scope, 399 dataset_scope=dataset_scope, 400 access_token=access_token, 401 ) 402 403 # if there are no errors in validation process 404 if val_errors == []: 405 # upload 
manifest file from `manifest_path` path to entity with Syn ID `dataset_id` 406 if os.path.exists(censored_manifest_path): 407 syn_store.associateMetadataWithFiles( 408 dmge=self.dmge, 409 metadataManifestPath=censored_manifest_path, 410 datasetId=dataset_id, 411 manifest_record_type=manifest_record_type, 412 hideBlanks=hide_blanks, 413 table_manipulation=table_manipulation, 414 table_column_names=table_column_names, 415 annotation_keys=annotation_keys, 416 file_annotations_upload=file_annotations_upload, 417 ) 418 restrict_maniest = True 419 420 manifest_id = syn_store.associateMetadataWithFiles( 421 dmge=self.dmge, 422 metadataManifestPath=manifest_path, 423 datasetId=dataset_id, 424 manifest_record_type=manifest_record_type, 425 hideBlanks=hide_blanks, 426 restrict_manifest=restrict_maniest, 427 table_manipulation=table_manipulation, 428 table_column_names=table_column_names, 429 annotation_keys=annotation_keys, 430 file_annotations_upload=file_annotations_upload, 431 ) 432 433 logger.info("No validation errors occured during validation.") 434 return manifest_id 435 436 else: 437 raise ValidationError( 438 "Manifest could not be validated under provided data model. 
" 439 f"Validation failed with the following errors: {val_errors}" 440 ) 441 442 # no need to perform validation, just submit/associate the metadata manifest file 443 if os.path.exists(censored_manifest_path): 444 syn_store.associateMetadataWithFiles( 445 dmge=self.dmge, 446 metadataManifestPath=censored_manifest_path, 447 datasetId=dataset_id, 448 manifest_record_type=manifest_record_type, 449 hideBlanks=hide_blanks, 450 table_manipulation=table_manipulation, 451 table_column_names=table_column_names, 452 annotation_keys=annotation_keys, 453 file_annotations_upload=file_annotations_upload, 454 ) 455 restrict_maniest = True 456 457 manifest_id = syn_store.associateMetadataWithFiles( 458 dmge=self.dmge, 459 metadataManifestPath=manifest_path, 460 datasetId=dataset_id, 461 manifest_record_type=manifest_record_type, 462 hideBlanks=hide_blanks, 463 restrict_manifest=restrict_maniest, 464 table_manipulation=table_manipulation, 465 table_column_names=table_column_names, 466 annotation_keys=annotation_keys, 467 file_annotations_upload=file_annotations_upload, 468 ) 469 470 logger.debug( 471 "Optional validation was not performed on manifest before association." 472 ) 473 474 return manifest_id
Metadata model wrapper around schema.org specification graph.
Provides basic utilities to:
1) manipulate the metadata model 2) generate metadata model views: - generate manifest view of the metadata model - generate validation schema view of the metadata model
40 def __init__( 41 self, 42 inputMModelLocation: str, 43 inputMModelLocationType: str, 44 data_model_labels: str, 45 ) -> None: 46 """Instantiates a MetadataModel object. 47 48 Args: 49 inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location 50 inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine) 51 """ 52 # extract extension of 'inputMModelLocation' 53 # ensure that it is necessarily pointing to a '.jsonld' file 54 55 logger.debug( 56 f"Initializing DataModelGraphExplorer object from {inputMModelLocation} schema." 57 ) 58 59 # self.inputMModelLocation remains for backwards compatibility 60 self.inputMModelLocation = inputMModelLocation 61 self.path_to_json_ld = inputMModelLocation 62 63 data_model_parser = DataModelParser(path_to_data_model=self.inputMModelLocation) 64 # Parse Model 65 parsed_data_model = data_model_parser.parse_model() 66 67 # Instantiate DataModelGraph 68 data_model_grapher = DataModelGraph(parsed_data_model, data_model_labels) 69 70 # Generate graph 71 self.graph_data_model = data_model_grapher.graph 72 73 self.dmge = DataModelGraphExplorer(self.graph_data_model) 74 75 # check if the type of MModel file is "local" 76 # currently, the application only supports reading from local JSON-LD files 77 if inputMModelLocationType == "local": 78 self.inputMModelLocationType = inputMModelLocationType 79 else: 80 raise ValueError( 81 f"The type '{inputMModelLocationType}' is currently not supported." 82 )
Instantiates a MetadataModel object.
Arguments:
- inputMModelLocation: local path, uri, synapse entity id (e.g. gs://, syn123, /User/x/…); present location
- inputMModelLocationType: specifier to indicate where the metadata model resource can be found (e.g. 'local' if file/JSON-LD is on local machine)
84 def getModelSubgraph(self, rootNode: str, subgraphType: str) -> nx.DiGraph: 85 """Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType. 86 87 Args: 88 rootNode: a schema node label (i.e. term). 89 subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels). 90 91 Returns: 92 A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants. 93 94 Raises: 95 ValueError: rootNode not found in metadata model. 96 """ 97 pass
Gets a schema subgraph from rootNode descendants based on edge/node properties of type subgraphType.
Arguments:
- rootNode: a schema node label (i.e. term).
- subgraphType: the kind of subgraph to traverse (i.e. based on node properties or edge labels).
Returns:
A directed subgraph (networkx DiGraph) of the metadata model with vertex set root node descendants.
Raises:
- ValueError: rootNode not found in metadata model.
99 def getOrderedModelNodes(self, rootNode: str, relationshipType: str) -> List[str]: 100 """Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type. 101 102 Args: 103 rootNode: a schema object/node label (i.e. term) 104 relationshipType: edge label type of the schema subgraph (e.g. requiresDependency) 105 106 Returns: 107 An ordered list of objects, that are all descendants of rootNode. 108 109 Raises: 110 ValueError: rootNode not found in metadata model. 111 """ 112 ordered_nodes = self.dmge.get_descendants_by_edge_type( 113 rootNode, relationshipType, connected=True, ordered=True 114 ) 115 116 ordered_nodes.reverse() 117 118 return ordered_nodes
Get a list of model objects ordered by their topological sort rank in a model subgraph on edges of a given relationship type.
Arguments:
- rootNode: a schema object/node label (i.e. term)
- relationshipType: edge label type of the schema subgraph (e.g. requiresDependency)
Returns:
An ordered list of objects, that are all descendants of rootNode.
Raises:
- ValueError: rootNode not found in metadata model.
120 def getModelManifest( 121 self, 122 title: str, 123 rootNode: str, 124 datasetId: str = None, 125 jsonSchema: str = None, 126 filenames: list = None, 127 useAnnotations: bool = False, 128 sheetUrl: bool = True, 129 ) -> str: 130 """Gets data from the annotations manifest file. 131 132 TBD: Does this method belong here or in manifest generator? 133 134 Args: 135 rootNode: a schema node label (i.e. term). 136 useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default). 137 138 Returns: 139 A manifest URI (assume Google doc for now). 140 141 Raises: 142 ValueError: rootNode not found in metadata model. 143 """ 144 additionalMetadata = {} 145 if filenames: 146 additionalMetadata["Filename"] = filenames 147 148 mg = ManifestGenerator( 149 path_to_json_ld=self.inputMModelLocation, 150 graph=self.graph_data_model, 151 title=title, 152 root=rootNode, 153 additional_metadata=additionalMetadata, 154 use_annotations=useAnnotations, 155 ) 156 157 if datasetId: 158 return mg.get_manifest( 159 dataset_id=datasetId, json_schema=jsonSchema, sheet_url=sheetUrl 160 ) 161 162 return mg.get_manifest(sheet_url=sheetUrl)
Gets data from the annotations manifest file.
TBD: Does this method belong here or in manifest generator?
Arguments:
- rootNode: a schema node label (i.e. term).
- useAnnotations: whether to populate manifest with current file annotations (True) or not (False, default).
Returns:
A manifest URI (assume Google doc for now).
Raises:
- ValueError: rootNode not found in metadata model.
164 def get_component_requirements( 165 self, source_component: str, as_graph: bool = False 166 ) -> List: 167 """Given a source model component (see https://w3id.org/biolink/vocab/category for definnition of component), return all components required by it. 168 Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes; 169 Can be utilized to track metadata completion progress across multiple categories of attributes. 170 171 Args: 172 source_component: an attribute label indicating the source component. 173 as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG) 174 175 Returns: 176 A list of required components associated with the source component. 177 """ 178 179 # get required components for the input/source component 180 req_components = self.dmge.get_component_requirements(source_component) 181 182 # retreive components as graph 183 if as_graph: 184 req_components_graph = self.dmge.get_component_requirements_graph( 185 source_component 186 ) 187 188 # serialize component dependencies DAG to a edge list of node tuples 189 req_components = list(req_components_graph.edges()) 190 191 return req_components 192 193 return req_components
Given a source model component (see https://w3id.org/biolink/vocab/category for definition of component), return all components required by it. Useful to construct requirement dependencies not only between specific attributes but also between categories/components of attributes; can be utilized to track metadata completion progress across multiple categories of attributes.
Arguments:
- source_component: an attribute label indicating the source component.
- as_graph: if False return component requirements as a list; if True return component requirements as a dependency graph (i.e. a DAG)
Returns:
A list of required components associated with the source component.
196 @tracer.start_as_current_span("MetadataModel::validateModelManifest") 197 def validateModelManifest( 198 self, 199 manifestPath: str, 200 rootNode: str, 201 restrict_rules: bool = False, 202 jsonSchema: Optional[str] = None, 203 project_scope: Optional[List] = None, 204 dataset_scope: Optional[str] = None, 205 access_token: Optional[str] = None, 206 ) -> tuple[list, list]: 207 """Check if provided annotations manifest dataframe satisfies all model requirements. 208 209 Args: 210 rootNode: a schema node label (i.e. term). 211 manifestPath: a path to the manifest csv file containing annotations. 212 restrict_rules: bypass great expectations and restrict rule options to those implemented in house 213 214 Returns: 215 A validation status message; if there is an error the message. 216 contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record. 217 218 Raises: 219 ValueError: rootNode not found in metadata model. 220 """ 221 # get validation schema for a given node in the data model, if the user has not provided input validation schema 222 223 if not jsonSchema: 224 # Instantiate Data Model Json Schema 225 self.data_model_js = DataModelJSONSchema( 226 jsonld_path=self.inputMModelLocation, graph=self.graph_data_model 227 ) 228 229 jsonSchema = self.data_model_js.get_json_validation_schema( 230 rootNode, rootNode + "_validation" 231 ) 232 233 errors = [] 234 warnings = [] 235 236 load_args = { 237 "dtype": "string", 238 } 239 # get annotations from manifest (array of json annotations corresponding to manifest rows) 240 manifest = load_df( 241 manifestPath, 242 preserve_raw_input=False, 243 allow_na_values=True, 244 **load_args, 245 ) # read manifest csv file as is from manifest path 246 247 # handler for mismatched components/data types 248 # throw TypeError if the value(s) in the "Component" column differ from the selected template type 249 if ("Component" in manifest.columns) and ( 250 
(len(manifest["Component"].unique()) > 1) 251 or (manifest["Component"].unique()[0] != rootNode) 252 ): 253 logging.error( 254 f"The 'Component' column value(s) {manifest['Component'].unique()} do not match the " 255 f"selected template type '{rootNode}'." 256 ) 257 258 # row indexes for all rows where 'Component' is rootNode 259 row_idxs = manifest.index[manifest["Component"] != rootNode].tolist() 260 # column index value for the 'Component' column 261 col_idx = manifest.columns.get_loc("Component") 262 # Series with index and 'Component' values from manifest 263 mismatched_ser = manifest.iloc[row_idxs, col_idx] 264 for index, component in mismatched_ser.items(): 265 errors.append( 266 [ 267 index + 2, 268 "Component", 269 f"Component value provided is: '{component}', whereas the Template Type is: '{rootNode}'", 270 # tuple of the component in the manifest and selected template type 271 # check: R/Reticulate cannnot handle dicts? So returning tuple 272 (component, rootNode), 273 ] 274 ) 275 276 return errors, warnings 277 278 errors, warnings, manifest = validate_all( 279 self, 280 errors=errors, 281 warnings=warnings, 282 manifest=manifest, 283 manifestPath=manifestPath, 284 dmge=self.dmge, 285 jsonSchema=jsonSchema, 286 restrict_rules=restrict_rules, 287 project_scope=project_scope, 288 dataset_scope=dataset_scope, 289 access_token=access_token, 290 ) 291 return errors, warnings
Check if provided annotations manifest dataframe satisfies all model requirements.
Arguments:
- rootNode: a schema node label (i.e. term).
- manifestPath: a path to the manifest csv file containing annotations.
- restrict_rules: bypass great expectations and restrict rule options to those implemented in house
Returns:
A validation status message; if there is an error, the message contains the manifest annotation record (i.e. row) that is invalid, along with the validation error associated with this record.
Raises:
- ValueError: rootNode not found in metadata model.
293 def populateModelManifest( 294 self, title, manifestPath: str, rootNode: str, return_excel=False 295 ) -> str: 296 """Populate an existing annotations manifest based on a dataframe. 297 TODO: Remove this method; always use getModelManifest instead 298 299 Args: 300 rootNode: a schema node label (i.e. term). 301 manifestPath: a path to the manifest csv file containing annotations. 302 303 Returns: 304 A link to the filled in model manifest (e.g. google sheet). 305 306 Raises: 307 ValueError: rootNode not found in metadata model. 308 """ 309 mg = ManifestGenerator( 310 path_to_data_model=self.inputMModelLocation, 311 graph=self.graph_data_model, 312 title=title, 313 root=rootNode, 314 ) 315 316 emptyManifestURL = mg.get_manifest() 317 318 return mg.populate_manifest_spreadsheet( 319 manifestPath, emptyManifestURL, return_excel=return_excel, title=title 320 )
Populate an existing annotations manifest based on a dataframe. TODO: Remove this method; always use getModelManifest instead
Arguments:
- rootNode: a schema node label (i.e. term).
- manifestPath: a path to the manifest csv file containing annotations.
Returns:
A link to the filled in model manifest (e.g. google sheet).
Raises:
- ValueError: rootNode not found in metadata model.
322 @tracer.start_as_current_span("MetadataModel::submit_metadata_manifest") 323 def submit_metadata_manifest( # pylint: disable=too-many-arguments, too-many-locals 324 self, 325 manifest_path: str, 326 dataset_id: str, 327 manifest_record_type: str, 328 restrict_rules: bool, 329 access_token: Optional[str] = None, 330 validate_component: Optional[str] = None, 331 file_annotations_upload: bool = True, 332 hide_blanks: bool = False, 333 project_scope: Optional[list] = None, 334 dataset_scope: Optional[str] = None, 335 table_manipulation: str = "replace", 336 table_column_names: str = "class_label", 337 annotation_keys: str = "class_label", 338 ) -> str: 339 """ 340 Wrap methods that are responsible for validation of manifests for a given component, 341 and association of the same manifest file with a specified dataset. 342 343 Args: 344 manifest_path (str): Path to the manifest file, which contains the metadata. 345 dataset_id (str): Synapse ID of the dataset on Synapse containing the 346 metadata manifest file. 347 manifest_record_type (str): How the manifest is stored in Synapse 348 restrict_rules (bool): 349 If True: bypass great expectations and restrict rule options to 350 those implemented in house 351 access_token (Optional[str], optional): Defaults to None. 352 validate_component (Optional[str], optional): Component from the schema.org 353 schema based on which the manifest template has been generated. 354 file_annotations_upload (bool, optional): Default to True. If false, do 355 not add annotations to files. Defaults to True. 356 hide_blanks (bool, optional): Defaults to False. 357 project_scope (Optional[list], optional): Defaults to None. 358 table_manipulation (str, optional): Defaults to "replace". 359 table_column_names (str, optional): Defaults to "class_label". 360 annotation_keys (str, optional): Defaults to "class_label". 361 362 Raises: 363 ValueError: When validate_component is provided, but it cannot be found in the schema. 
364 ValidationError: If validation against data model was not successful. 365 366 Returns: 367 str: If both validation and association were successful. 368 """ 369 # TODO: avoid explicitly exposing Synapse store functionality 370 # just instantiate a Store class and let it decide at runtime/config 371 # the store type 372 syn_store = SynapseStorage( 373 access_token=access_token, project_scope=project_scope 374 ) 375 manifest_id = None 376 restrict_maniest = False 377 censored_manifest_path = manifest_path.replace(".csv", "_censored.csv") 378 # check if user wants to perform validation or not 379 if validate_component is not None: 380 try: 381 # check if the component ("class" in schema) passed as argument is valid 382 # (present in schema) or not 383 self.dmge.is_class_in_schema(validate_component) 384 except Exception as exc: 385 # a KeyError exception is raised when validate_component fails in the 386 # try-block above here, we are suppressing the KeyError exception and 387 # replacing it with a more descriptive ValueError exception 388 raise ValueError( 389 f"The component '{validate_component}' could not be found " 390 f"in the schema here '{self.path_to_json_ld}'" 391 ) from exc 392 393 # automatic JSON schema generation and validation with that JSON schema 394 val_errors, _ = self.validateModelManifest( 395 manifestPath=manifest_path, 396 rootNode=validate_component, 397 restrict_rules=restrict_rules, 398 project_scope=project_scope, 399 dataset_scope=dataset_scope, 400 access_token=access_token, 401 ) 402 403 # if there are no errors in validation process 404 if val_errors == []: 405 # upload manifest file from `manifest_path` path to entity with Syn ID `dataset_id` 406 if os.path.exists(censored_manifest_path): 407 syn_store.associateMetadataWithFiles( 408 dmge=self.dmge, 409 metadataManifestPath=censored_manifest_path, 410 datasetId=dataset_id, 411 manifest_record_type=manifest_record_type, 412 hideBlanks=hide_blanks, 413 
table_manipulation=table_manipulation, 414 table_column_names=table_column_names, 415 annotation_keys=annotation_keys, 416 file_annotations_upload=file_annotations_upload, 417 ) 418 restrict_maniest = True 419 420 manifest_id = syn_store.associateMetadataWithFiles( 421 dmge=self.dmge, 422 metadataManifestPath=manifest_path, 423 datasetId=dataset_id, 424 manifest_record_type=manifest_record_type, 425 hideBlanks=hide_blanks, 426 restrict_manifest=restrict_maniest, 427 table_manipulation=table_manipulation, 428 table_column_names=table_column_names, 429 annotation_keys=annotation_keys, 430 file_annotations_upload=file_annotations_upload, 431 ) 432 433 logger.info("No validation errors occured during validation.") 434 return manifest_id 435 436 else: 437 raise ValidationError( 438 "Manifest could not be validated under provided data model. " 439 f"Validation failed with the following errors: {val_errors}" 440 ) 441 442 # no need to perform validation, just submit/associate the metadata manifest file 443 if os.path.exists(censored_manifest_path): 444 syn_store.associateMetadataWithFiles( 445 dmge=self.dmge, 446 metadataManifestPath=censored_manifest_path, 447 datasetId=dataset_id, 448 manifest_record_type=manifest_record_type, 449 hideBlanks=hide_blanks, 450 table_manipulation=table_manipulation, 451 table_column_names=table_column_names, 452 annotation_keys=annotation_keys, 453 file_annotations_upload=file_annotations_upload, 454 ) 455 restrict_maniest = True 456 457 manifest_id = syn_store.associateMetadataWithFiles( 458 dmge=self.dmge, 459 metadataManifestPath=manifest_path, 460 datasetId=dataset_id, 461 manifest_record_type=manifest_record_type, 462 hideBlanks=hide_blanks, 463 restrict_manifest=restrict_maniest, 464 table_manipulation=table_manipulation, 465 table_column_names=table_column_names, 466 annotation_keys=annotation_keys, 467 file_annotations_upload=file_annotations_upload, 468 ) 469 470 logger.debug( 471 "Optional validation was not performed on manifest 
before association." 472 ) 473 474 return manifest_id
Wrap methods that are responsible for validation of manifests for a given component, and association of the same manifest file with a specified dataset.
Arguments:
- manifest_path (str): Path to the manifest file, which contains the metadata.
- dataset_id (str): Synapse ID of the dataset on Synapse containing the metadata manifest file.
- manifest_record_type (str): How the manifest is stored in Synapse
- restrict_rules (bool): If True: bypass great expectations and restrict rule options to those implemented in house
- access_token (Optional[str], optional): Defaults to None.
- validate_component (Optional[str], optional): Component from the schema.org schema based on which the manifest template has been generated.
- file_annotations_upload (bool, optional): Default to True. If false, do not add annotations to files. Defaults to True.
- hide_blanks (bool, optional): Defaults to False.
- project_scope (Optional[list], optional): Defaults to None.
- table_manipulation (str, optional): Defaults to "replace".
- table_column_names (str, optional): Defaults to "class_label".
- annotation_keys (str, optional): Defaults to "class_label".
Raises:
- ValueError: When validate_component is provided, but it cannot be found in the schema.
- ValidationError: If validation against data model was not successful.
Returns:
str: If both validation and association were successful.