schematic.store.synapse
Synapse storage class
1"""Synapse storage class""" 2 3import asyncio 4import atexit 5import logging 6import os 7import re 8import secrets 9import shutil 10import time 11import uuid # used to generate unique names for entities 12from copy import deepcopy 13from dataclasses import dataclass, field 14from time import sleep 15 16# allows specifying explicit variable types 17from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union 18 19import numpy as np 20import pandas as pd 21import synapseclient 22from opentelemetry import trace 23from synapseclient import Annotations as OldAnnotations 24from synapseclient import ( 25 Column, 26 EntityViewSchema, 27 EntityViewType, 28 File, 29 Folder, 30 Schema, 31 Synapse, 32 Table, 33 as_table_columns, 34) 35from synapseclient.annotations import _convert_to_annotations_list 36from synapseclient.api import get_config_file, get_entity_id_bundle2 37from synapseclient.core.constants.concrete_types import PROJECT_ENTITY 38from synapseclient.core.exceptions import ( 39 SynapseAuthenticationError, 40 SynapseHTTPError, 41 SynapseUnmetAccessRestrictions, 42) 43from synapseclient.models.annotations import Annotations 44from synapseclient.table import CsvFileTable, Schema, build_table 45from tenacity import ( 46 retry, 47 retry_if_exception_type, 48 stop_after_attempt, 49 wait_chain, 50 wait_fixed, 51) 52 53from schematic.configuration.configuration import CONFIG 54from schematic.exceptions import AccessCredentialsError 55from schematic.schemas.data_model_graph import DataModelGraphExplorer 56from schematic.store.base import BaseStorage 57from schematic.store.database.synapse_database import SynapseDatabase 58from schematic.store.synapse_tracker import SynapseEntityTracker 59from schematic.utils.df_utils import ( 60 STR_NA_VALUES_FILTERED, 61 col_in_dataframe, 62 load_df, 63 update_df, 64) 65 66# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment 67# Please do not remove these import statements 68from schematic.utils.general import ( 69 check_synapse_cache_size, 70 clear_synapse_cache, 71 create_temp_folder, 72 entity_type_mapping, 73 get_dir_size, 74 create_like_statement, 75) 76from schematic.utils.io_utils import cleanup_temporary_storage 77from schematic.utils.schema_utils import get_class_label_from_display_name 78from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list 79 80 81logger = logging.getLogger("Synapse storage") 82 83tracer = trace.get_tracer("Schematic") 84 85ID_COLUMN = "Id" 86ENTITY_ID_COLUMN = "entityId" 87UUID_COLUMN = "uuid" 88 89 90@dataclass 91class ManifestDownload(object): 92 """ 93 syn: an object of type synapseclient. 94 manifest_id: id of a manifest 95 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 96 """ 97 98 syn: synapseclient.Synapse 99 manifest_id: str 100 synapse_entity_tracker: SynapseEntityTracker = field( 101 default_factory=SynapseEntityTracker 102 ) 103 104 def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File: 105 """ 106 Try downloading a manifest to a specific folder (temporary or not). When the 107 `use_temporary_folder` is set to True, the manifest will be downloaded to a 108 temporary folder. This is useful for when the code is running as an API server 109 where multiple requests are being made at the same time. This will prevent 110 multiple requests from overwriting the same manifest file. 
When the 111 `use_temporary_folder` is set to False, the manifest will be downloaded to the 112 default manifest folder. 113 114 Args: 115 use_temporary_folder: boolean argument indicating if a temporary folder 116 should be used to store the manifest file. This is useful when running 117 this code as an API server where multiple requests could be made at the 118 same time. This is set to False when the code is being used from the 119 CLI. Defaults to True. 120 121 Return: 122 manifest_data: A Synapse file entity of the downloaded manifest 123 """ 124 manifest_data = self.synapse_entity_tracker.get( 125 synapse_id=self.manifest_id, 126 syn=self.syn, 127 download_file=False, 128 retrieve_if_not_present=False, 129 ) 130 current_span = trace.get_current_span() 131 if ( 132 manifest_data 133 and (file_handle := manifest_data.get("_file_handle", None)) 134 and current_span.is_recording() 135 ): 136 current_span.set_attribute( 137 "schematic.manifest_size", file_handle.get("contentSize", 0) 138 ) 139 140 if manifest_data and manifest_data.path: 141 return manifest_data 142 143 if "SECRETS_MANAGER_SECRETS" in os.environ: 144 temporary_manifest_storage = "/var/tmp/temp_manifest_download" 145 cleanup_temporary_storage( 146 temporary_manifest_storage, time_delta_seconds=3600 147 ) 148 # create a new directory to store manifest 149 if not os.path.exists(temporary_manifest_storage): 150 os.mkdir(temporary_manifest_storage) 151 # create temporary folders for storing manifests 152 download_location = create_temp_folder( 153 path=temporary_manifest_storage, 154 prefix=f"{self.manifest_id}-{time.time()}-", 155 ) 156 else: 157 if use_temporary_folder: 158 download_location = create_temp_folder( 159 path=CONFIG.manifest_folder, 160 prefix=f"{self.manifest_id}-{time.time()}-", 161 ) 162 else: 163 download_location = CONFIG.manifest_folder 164 165 manifest_data = self.synapse_entity_tracker.get( 166 synapse_id=self.manifest_id, 167 syn=self.syn, 168 download_file=True, 169 retrieve_if_not_present=True, 170 download_location=download_location, 171 ) 172 173 # This is doing a rename of the downloaded file. The reason this is important 174 # is that if we are re-using a file that was previously downloaded, but the 175 # file had been renamed. The file downloaded from the Synapse client is just 176 # a direct copy of that renamed file. This code will set the name of the file 177 # to the original name that was used to download the file. Note: An MD5 checksum 178 # of the file will still be performed so if the file has changed, it will be 179 # downloaded again. 
180 filename = manifest_data._file_handle.fileName 181 if filename != os.path.basename(manifest_data.path): 182 parent_folder = os.path.dirname(manifest_data.path) 183 manifest_original_name_and_path = os.path.join(parent_folder, filename) 184 185 self.syn.cache.remove( 186 file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path 187 ) 188 os.rename(manifest_data.path, manifest_original_name_and_path) 189 manifest_data.path = manifest_original_name_and_path 190 self.syn.cache.add( 191 file_handle_id=manifest_data.dataFileHandleId, 192 path=manifest_original_name_and_path, 193 md5=manifest_data._file_handle.contentMd5, 194 ) 195 196 return manifest_data 197 198 def _entity_type_checking(self) -> str: 199 """ 200 check the entity type of the id that needs to be downloaded 201 Return: 202 if the entity type is wrong, raise an error 203 """ 204 # check the type of entity 205 entity_type = entity_type_mapping( 206 syn=self.syn, 207 entity_id=self.manifest_id, 208 synapse_entity_tracker=self.synapse_entity_tracker, 209 ) 210 if entity_type != "file": 211 logger.error( 212 f"You are using entity type: {entity_type}. Please provide a file ID" 213 ) 214 215 def download_manifest( 216 self, 217 newManifestName: str = "", 218 manifest_df: pd.DataFrame = pd.DataFrame(), 219 use_temporary_folder: bool = True, 220 ) -> Union[str, File]: 221 """ 222 Download a manifest based on a given manifest id. 223 Args: 224 newManifestName(optional): new name of a manifest that gets downloaded. 225 manifest_df(optional): a dataframe containing name and id of manifests in a given asset view 226 Return: 227 manifest_data: synapse entity file object 228 """ 229 230 # enables retrying if user does not have access to uncensored manifest 231 # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location 232 manifest_data = "" 233 234 # check entity type 235 self._entity_type_checking() 236 237 # download a manifest 238 try: 239 manifest_data = self._download_manifest_to_folder( 240 use_temporary_folder=use_temporary_folder 241 ) 242 except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError): 243 # if there's an error getting an uncensored manifest, try getting the censored manifest 244 if not manifest_df.empty: 245 censored_regex = re.compile(".*censored.*") 246 censored = manifest_df["name"].str.contains(censored_regex) 247 new_manifest_id = manifest_df[censored]["id"][0] 248 self.manifest_id = new_manifest_id 249 try: 250 manifest_data = self._download_manifest_to_folder( 251 use_temporary_folder=use_temporary_folder 252 ) 253 except ( 254 SynapseUnmetAccessRestrictions, 255 SynapseAuthenticationError, 256 ) as e: 257 raise PermissionError( 258 "You don't have access to censored and uncensored manifests in this dataset." 259 ) from e 260 else: 261 logger.error( 262 f"You don't have access to the requested resource: {self.manifest_id}" 263 ) 264 265 if newManifestName and os.path.exists(manifest_data.get("path")): 266 # Rename the file we just made to the new name 267 new_manifest_filename = newManifestName + ".csv" 268 269 # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest. 270 parent_folder = os.path.dirname(manifest_data.get("path")) 271 272 new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename) 273 274 # Copy file to new location. The purpose of using a copy instead of a rename 275 # is to avoid any potential issues with the file being used in another 276 # process. 
This avoids any potential race or code cocurrency conditions. 277 shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name) 278 279 # Adding this to cache will allow us to re-use the already downloaded 280 # manifest file for up to 1 hour. 281 self.syn.cache.add( 282 file_handle_id=manifest_data.dataFileHandleId, 283 path=new_manifest_path_name, 284 md5=manifest_data._file_handle.contentMd5, 285 ) 286 287 # Update file names/paths in manifest_data 288 manifest_data["name"] = new_manifest_filename 289 manifest_data["filename"] = new_manifest_filename 290 manifest_data["path"] = new_manifest_path_name 291 292 return manifest_data 293 294 295class SynapseStorage(BaseStorage): 296 """Implementation of Storage interface for datasets/files stored on Synapse. 297 Provides utilities to list files in a specific project; update files annotations, create fileviews, etc. 298 299 TODO: Need to define the interface and rename and/or refactor some of the methods below. 300 """ 301 302 @tracer.start_as_current_span("SynapseStorage::__init__") 303 def __init__( 304 self, 305 token: Optional[str] = None, # optional parameter retrieved from browser cookie 306 access_token: Optional[str] = None, 307 project_scope: Optional[list] = None, 308 synapse_cache_path: Optional[str] = None, 309 perform_query: Optional[bool] = True, 310 columns: Optional[list] = None, 311 where_clauses: Optional[list] = None, 312 ) -> None: 313 """Initializes a SynapseStorage object. 314 315 Args: 316 token (Optional[str], optional): 317 Optional token parameter as found in browser cookie upon login to synapse. 318 Defaults to None. 319 access_token (Optional[list], optional): 320 Optional access token (personal or oauth). 321 Defaults to None. 322 project_scope (Optional[list], optional): Defaults to None. 323 synapse_cache_path (Optional[str], optional): 324 Location of synapse cache. 325 Defaults to None. 326 TODO: 327 Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped querys expands. 328 """ 329 self.syn = self.login(synapse_cache_path, access_token) 330 self.project_scope = project_scope 331 self.storageFileview = CONFIG.synapse_master_fileview_id 332 self.manifest = CONFIG.synapse_manifest_basename 333 self.root_synapse_cache = self.syn.cache.cache_root_dir 334 self.synapse_entity_tracker = SynapseEntityTracker() 335 if perform_query: 336 self.query_fileview(columns=columns, where_clauses=where_clauses) 337 338 # TODO: When moving this over to a regular cron-job the following logic should be 339 # out of `manifest_download`: 340 # if "SECRETS_MANAGER_SECRETS" in os.environ: 341 # temporary_manifest_storage = "/var/tmp/temp_manifest_download" 342 # cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600) 343 @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache") 344 def _purge_synapse_cache( 345 self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15 346 ) -> None: 347 """ 348 Purge synapse cache if it exceeds a certain size. Default to 1GB. 349 Args: 350 maximum_storage_allowed_cache_gb (int): the maximum storage allowed 351 before purging cache. Default is 1 GB. 
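
# Illustrative usage sketch (not part of the original module); the Synapse ID and
# file name below are hypothetical:
#
#     syn = SynapseStorage.login()
#     md = ManifestDownload(syn=syn, manifest_id="syn12345678")
#     manifest_file = md.download_manifest(newManifestName="renamed_manifest")
#     print(manifest_file.path)
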
class SynapseStorage(BaseStorage):
    """Implementation of Storage interface for datasets/files stored on Synapse.
    Provides utilities to list files in a specific project; update file annotations; create fileviews; etc.

    TODO: Need to define the interface and rename and/or refactor some of the methods below.
    """

    @tracer.start_as_current_span("SynapseStorage::__init__")
    def __init__(
        self,
        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
        access_token: Optional[str] = None,
        project_scope: Optional[list] = None,
        synapse_cache_path: Optional[str] = None,
        perform_query: Optional[bool] = True,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
    ) -> None:
        """Initializes a SynapseStorage object.

        Args:
            token (Optional[str], optional):
                Optional token parameter as found in browser cookie upon login to synapse.
                Defaults to None.
            access_token (Optional[str], optional):
                Optional access token (personal or OAuth).
                Defaults to None.
            project_scope (Optional[list], optional): Defaults to None.
            synapse_cache_path (Optional[str], optional):
                Location of synapse cache.
                Defaults to None.
        TODO:
            Consider the necessity of adding the "columns" and "where_clauses" params to the constructor. With how `query_fileview` is currently implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
        """
        self.syn = self.login(synapse_cache_path, access_token)
        self.project_scope = project_scope
        self.storageFileview = CONFIG.synapse_master_fileview_id
        self.manifest = CONFIG.synapse_manifest_basename
        self.root_synapse_cache = self.syn.cache.cache_root_dir
        self.synapse_entity_tracker = SynapseEntityTracker()
        if perform_query:
            self.query_fileview(columns=columns, where_clauses=where_clauses)

    # TODO: When moving this over to a regular cron-job the following logic should be
    # moved out of `manifest_download`:
    # if "SECRETS_MANAGER_SECRETS" in os.environ:
    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
    def _purge_synapse_cache(
        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
    ) -> None:
        """
        Purge the Synapse cache if it exceeds a certain size. Defaults to 1 GB.
        Args:
            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
                before purging the cache. Default is 1 GB.
            minute_buffer (int): files created this many minutes ago or earlier will be deleted
        """
        # try clearing the cache
        # scan the directory and check the size of the files
        if os.path.exists(self.root_synapse_cache):
            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
                1024**3
            )
            nbytes = get_dir_size(self.root_synapse_cache)
            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
            # if the allowed size has already been exceeded, purge files older than the minute buffer
            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
                num_of_deleted_files = clear_synapse_cache(
                    self.syn.cache, minutes=minute_buffer
                )
                logger.info(
                    f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
                )
            else:
                # on AWS, the OS takes around 14-17% of our ephemeral storage (20 GiB);
                # instead of guessing how much space we have left, log the total size of .synapseCache here
                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")

    @tracer.start_as_current_span("SynapseStorage::query_fileview")
    def query_fileview(
        self,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
        force_requery: Optional[bool] = False,
    ) -> None:
        """
        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
        """
        self._purge_synapse_cache()

        # Initialize to assume that the new fileview query will be different from what may already be stored.
        # Initializes to True because generally one will not have already been performed.
        self.new_query_different = True

        # If a query has already been performed, store the query
        previous_query_built = hasattr(self, "fileview_query")
        if previous_query_built:
            previous_query = self.fileview_query

        # Build a query with the current given parameters and check to see if it is different from the previous one
        self._build_query(columns=columns, where_clauses=where_clauses)
        if previous_query_built:
            self.new_query_different = self.fileview_query != previous_query

        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
        if self.new_query_different or force_requery:
            try:
                self.storageFileviewTable = self.syn.tableQuery(
                    query=self.fileview_query,
                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
            except SynapseHTTPError as exc:
                exception_text = str(exc)
                if "Unknown column path" in exception_text:
                    raise ValueError(
                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
                    )
                elif "Unknown column" in exception_text:
                    missing_column = exception_text.split("Unknown column ")[-1]
                    raise ValueError(
                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
                    )
                else:
                    raise AccessCredentialsError(self.storageFileview)

    @staticmethod
    def build_clause_from_dataset_id(
        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
    ) -> str:
        """
        Method to build a where clause for a Synapse FileView query based on a dataset ID; can be used before an object is initialized.
        Args:
            dataset_id: Synapse ID of a dataset that should be used to limit the query
            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
        Returns:
            clause for the query, or an empty string if no dataset ID is provided
        """
        # Calling this method without specifying synIDs will complete but will not scope the view
        if (not dataset_id) and (not dataset_folder_list):
            return ""

        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
        if dataset_folder_list:
            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
            return f"parentId IN ({search_folders})"

        # `dataset_id` should be provided when all files are stored directly under the dataset folder
        return f"parentId='{dataset_id}'"

    def _build_query(
        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
    ):
        """
        Method to build a query for Synapse FileViews
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            self.storageFileview (str): Synapse FileView ID
            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
                Gets added to where_clauses; included more for backward compatibility and as a user-friendly way of subsetting the view in a simple way.
        """
        if columns is None:
            columns = []
        if where_clauses is None:
            where_clauses = []

        if self.project_scope:
            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
            where_clauses.append(project_scope_clause)

        if where_clauses:
            where_clauses = " AND ".join(where_clauses)
            where_clauses = f"WHERE {where_clauses} ;"
        else:
            where_clauses = ";"

        if columns:
            columns = ",".join(columns)
        else:
            columns = "*"

        self.fileview_query = (
            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
        )
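
    # Illustrative sketch of the query `_build_query` assembles (not part of the
    # original module); the fileview ID (syn222), project ID, and parent ID are
    # hypothetical:
    #
    #     store.project_scope = ["syn111"]
    #     store._build_query(columns=["id", "path"], where_clauses=["parentId='syn999'"])
    #     # store.fileview_query ->
    #     # "SELECT id,path FROM syn222 WHERE parentId='syn999' AND projectId IN ('syn111', '') ;"
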
    @staticmethod
    @tracer.start_as_current_span("SynapseStorage::login")
    def login(
        synapse_cache_path: Optional[str] = None,
        access_token: Optional[str] = None,
    ) -> synapseclient.Synapse:
        """Login to Synapse

        Args:
            synapse_cache_path (Optional[str], optional): location of synapse cache. Defaults to None.
            access_token (Optional[str], optional): A synapse access token. Defaults to None.

        Raises:
            ValueError: If unable to log in with the access token

        Returns:
            synapseclient.Synapse: A Synapse object that is logged in
        """
        if not access_token:
            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")

        # login using a token
        if access_token:
            try:
                syn = synapseclient.Synapse(
                    cache_root_dir=synapse_cache_path,
                    debug=False,
                    skip_checks=True,
                    cache_client=False,
                )
                syn.login(authToken=access_token, silent=True)
            except SynapseHTTPError as exc:
                raise ValueError(
                    "No access to resources. Please make sure that your token is correct"
                ) from exc
        else:
            # login using synapse credentials provided by the user in the .synapseConfig (default) file
            syn = synapseclient.Synapse(
                configPath=CONFIG.synapse_configuration_path,
                cache_root_dir=synapse_cache_path,
                debug=False,
                skip_checks=True,
                cache_client=False,
            )
            syn.login(silent=True)

        # set user id attribute
        current_span = trace.get_current_span()
        if current_span.is_recording():
            current_span.set_attribute("user.id", syn.credentials.owner_id)

        return syn

    def missing_entity_handler(method):
        """Decorator to handle missing entities in synchronous methods."""

        def wrapper(*args, **kwargs):
            try:
                return method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper

    def async_missing_entity_handler(method):
        """Decorator to handle missing entities in async methods."""

        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            try:
                return await method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper

    def getStorageFileviewTable(self):
        """Returns the storageFileviewTable obtained during initialization."""
        return self.storageFileviewTable

    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, Any]:
        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.

        Args:
            currentUserId: synapse id for the user whose projects we want to get.

        Returns:
            A dictionary with a next page token and the results.
        """
        all_results = self.syn.restGET(
            "/projects/user/{principalId}".format(principalId=currentUserId)
        )

        while (
            "nextPageToken" in all_results
        ):  # iterate over the next page token in the results while there is one
            results_token = self.syn.restGET(
                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
                    principalId=currentUserId,
                    nextPageToken=all_results["nextPageToken"],
                )
            )
            all_results["results"].extend(results_token["results"])

            if "nextPageToken" in results_token:
                all_results["nextPageToken"] = results_token["nextPageToken"]
            else:
                del all_results["nextPageToken"]

        return all_results

    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
    def getStorageProjects(
        self, project_scope: Optional[List] = None
    ) -> list[tuple[str, str]]:
        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

        Args:
            project_scope: list of project IDs used to limit the returned projects. Defaults to None.

        Returns:
            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
        """

        # get the set of all storage Synapse projects accessible for this pipeline
        storageProjects = self.storageFileviewTable["projectId"].unique()

        # get the set of storage Synapse projects accessible for this user
        # get a list of projects from Synapse
        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
            current_user_id=self.syn.credentials.owner_id, syn=self.syn
        )
        project_id_to_name_dict = {}
        current_user_projects = []
        for project_header in current_user_project_headers:
            project_id_to_name_dict[project_header.get("id")] = project_header.get(
                "name"
            )
            current_user_projects.append(project_header.get("id"))

        # find the set of user projects that are also in this pipeline's storage projects set
        storageProjects = list(set(storageProjects) & set(current_user_projects))

        # Limit projects to scope if specified
        if project_scope:
            storageProjects = list(set(storageProjects) & set(project_scope))

        if not storageProjects:
            raise Warning(
                f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
            )

        # prepare a return list of project IDs and names
        projects = []
        for projectId in storageProjects:
            project_name_from_project_header = project_id_to_name_dict.get(projectId)
            projects.append((projectId, project_name_from_project_header))

        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])

        return sorted_projects_list
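
    # Illustrative sketch (not part of the original module): authenticating with a
    # personal access token from the environment, then listing accessible projects;
    # the returned IDs and names are hypothetical.
    #
    #     syn = SynapseStorage.login(access_token=os.getenv("SYNAPSE_ACCESS_TOKEN"))
    #     store = SynapseStorage(perform_query=True)
    #     projects = store.getStorageProjects()
    #     # -> [("syn111", "Project A"), ("syn112", "Project B")]
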
661 """ 662 663 # select all folders and fetch their names from within the storage project; 664 # if folder content type is defined, only select folders that contain datasets 665 if "contentType" in self.storageFileviewTable.columns: 666 foldersTable = self.storageFileviewTable[ 667 (self.storageFileviewTable["contentType"] == "dataset") 668 & (self.storageFileviewTable["projectId"] == projectId) 669 ] 670 else: 671 foldersTable = self.storageFileviewTable[ 672 (self.storageFileviewTable["type"] == "folder") 673 & (self.storageFileviewTable["parentId"] == projectId) 674 ] 675 676 # get an array of tuples (folderId, folderName) 677 # some folders are part of datasets; others contain datasets 678 # each dataset parent is the project; folders part of a dataset have another folder as a parent 679 # to get folders if and only if they contain datasets for each folder 680 # check if folder's parent is the project; if so that folder contains a dataset, 681 # unless the folder list has already been filtered to dataset folders based on contentType attribute above 682 683 datasetList = [] 684 folderProperties = ["id", "name"] 685 for folder in list( 686 foldersTable[folderProperties].itertuples(index=False, name=None) 687 ): 688 datasetList.append(folder) 689 690 sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) 691 692 return sorted_dataset_list 693 694 @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") 695 def getFilesInStorageDataset( 696 self, datasetId: str, fileNames: List = None, fullpath: bool = True 697 ) -> List[Tuple[str, str]]: 698 """Gets all files (excluding manifest files) in a given dataset folder. 699 700 Args: 701 datasetId: synapse ID of a storage dataset. 702 fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. 703 metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 704 fullpath: if True return the full path as part of this filename; otherwise return just base filename 705 706 Returns: 707 A list of files; the list consists of tuples (fileId, fileName). 708 709 Raises: 710 ValueError: Dataset ID not found. 711 """ 712 file_list = [] 713 714 # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view 715 if self.storageFileviewTable.empty: 716 raise ValueError( 717 f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." 718 ) 719 child_path = self.storageFileviewTable.loc[ 720 self.storageFileviewTable["parentId"] == datasetId, "path" 721 ] 722 if child_path.empty: 723 raise LookupError( 724 f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 
725 ) 726 child_path = child_path.iloc[0] 727 728 # Get the dataset path by eliminating the child's portion of the path to account for nested datasets 729 parent = child_path.split("/")[:-1] 730 parent = "/".join(parent) 731 732 # When querying, only include files to exclude entity files and subdirectories 733 where_clauses = [create_like_statement(parent), "type='file'"] 734 735 # Requery the fileview to specifically get the files in the given dataset 736 self.query_fileview(columns=["id", "path"], where_clauses=where_clauses) 737 738 # Exclude manifest files 739 non_manifest_files = self.storageFileviewTable.loc[ 740 ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"), 741 :, 742 ] 743 744 # Remove all files that are not in the list of fileNames 745 if fileNames: 746 filename_regex = "|".join(fileNames) 747 748 matching_files = non_manifest_files["path"].str.contains( 749 filename_regex, case=False, regex=True 750 ) 751 752 non_manifest_files = non_manifest_files.loc[matching_files, :] 753 754 # Truncate path if necessary 755 if not fullpath: 756 non_manifest_files.path = non_manifest_files.path.apply(os.path.basename) 757 758 # Return list of files as expected by other methods 759 file_list = list(non_manifest_files.itertuples(index=False, name=None)) 760 761 return file_list 762 763 def _get_manifest_id(self, manifest: pd.DataFrame) -> str: 764 """If both censored and uncensored manifests are present, return uncensored manifest; if only one manifest is present, return manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one. 765 Args: 766 manifest: a dataframe contains name and id of manifests in a given asset view 767 768 Return: 769 manifest_syn_id: id of a given censored or uncensored manifest 770 """ 771 censored_regex = re.compile(".*censored.*") 772 censored = manifest["name"].str.contains(censored_regex) 773 if any(censored): 774 # Try to use uncensored manifest first 775 not_censored = ~censored 776 if any(not_censored): 777 manifest_syn_id = manifest[not_censored]["id"].iloc[0] 778 # if only censored manifests are available, just use the first censored manifest 779 else: 780 manifest_syn_id = manifest["id"].iloc[0] 781 782 # otherwise, use the first (implied only) version that exists 783 else: 784 manifest_syn_id = manifest["id"].iloc[0] 785 786 return manifest_syn_id 787 788 @tracer.start_as_current_span("SynapseStorage::getDatasetManifest") 789 def getDatasetManifest( 790 self, 791 datasetId: str, 792 downloadFile: bool = False, 793 newManifestName: str = "", 794 use_temporary_folder: bool = True, 795 ) -> Union[str, File]: 796 """Gets the manifest associated with a given dataset. 797 798 Args: 799 datasetId: synapse ID of a storage dataset. 800 downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not. 801 newManifestName: new name of a manifest that gets downloaded 802 use_temporary_folder: boolean argument indicating if a temporary folder 803 should be used to store the manifest file. This is useful when running 804 this code as an API server where multiple requests could be made at the 805 same time. This is set to False when the code is being used from the 806 CLI. Defaults to True. 807 808 Returns: 809 manifest_syn_id (String): Synapse ID of exisiting manifest file. 810 manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True. 811 "" (String): No pre-exisiting manifest in dataset. 
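
    # Illustrative sketch (not part of the original module); the dataset ID, file
    # IDs, and file names are hypothetical:
    #
    #     files = store.getFilesInStorageDataset("syn999", fullpath=False)
    #     # -> [("syn1000001", "sample_A.bam"), ("syn1000002", "sample_B.bam")]
    #     # Manifest files (paths containing "synapse_storage_manifest") are excluded.
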
812 """ 813 manifest_data = "" 814 815 # get a list of files containing the manifest for this dataset (if any) 816 all_files = self.storageFileviewTable 817 818 # construct regex based on manifest basename in the config 819 manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") 820 821 # search manifest based on given manifest basename regex above 822 # and return a dataframe containing name and id of manifests in a given asset view 823 manifest = all_files[ 824 (all_files["name"].str.contains(manifest_re, regex=True)) 825 & (all_files["parentId"] == datasetId) 826 ] 827 828 manifest = manifest[["id", "name"]] 829 830 # if there is no pre-exisiting manifest in the specified dataset 831 if manifest.empty: 832 logger.warning( 833 f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" 834 ) 835 return "" 836 837 # if there is an exisiting manifest 838 else: 839 manifest_syn_id = self._get_manifest_id(manifest) 840 if downloadFile: 841 md = ManifestDownload( 842 self.syn, 843 manifest_id=manifest_syn_id, 844 synapse_entity_tracker=self.synapse_entity_tracker, 845 ) 846 manifest_data = md.download_manifest( 847 newManifestName=newManifestName, 848 manifest_df=manifest, 849 use_temporary_folder=use_temporary_folder, 850 ) 851 # TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, 852 # then we should catch the error here without returning an empty string. 853 if not manifest_data: 854 logger.debug( 855 f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" 856 ) 857 return manifest_data 858 return manifest_syn_id 859 860 def getDataTypeFromManifest(self, manifestId: str): 861 """Fetch a manifest and return data types of all columns 862 Args: 863 manifestId: synapse ID of a manifest 864 """ 865 # get manifest file path 866 manifest_entity = self.synapse_entity_tracker.get( 867 synapse_id=manifestId, syn=self.syn, download_file=True 868 ) 869 manifest_filepath = manifest_entity.path 870 871 # load manifest dataframe 872 manifest = load_df( 873 manifest_filepath, 874 preserve_raw_input=False, 875 data_model=False, 876 ) 877 878 # convert the dataFrame to use best possible dtypes. 879 manifest_new = manifest.convert_dtypes() 880 881 # get data types of columns 882 result = manifest_new.dtypes.to_frame("dtypes").reset_index() 883 884 # return the result as a dictionary 885 result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() 886 887 return result_dict 888 889 def _get_files_metadata_from_dataset( 890 self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None 891 ) -> Optional[dict]: 892 """retrieve file ids under a particular datasetId 893 894 Args: 895 datasetId (str): a dataset id 896 only_new_files (bool): if only adding new files that are not already exist 897 manifest (pd.DataFrame): metadata manifest dataframe. Default to None. 
898 899 Returns: 900 a dictionary that contains filename and entityid under a given datasetId or None if there is nothing under a given dataset id are not available 901 """ 902 dataset_files = self.getFilesInStorageDataset(datasetId) 903 if dataset_files: 904 dataset_file_names_id_dict = self._get_file_entityIds( 905 dataset_files, only_new_files=only_new_files, manifest=manifest 906 ) 907 return dataset_file_names_id_dict 908 else: 909 return None 910 911 def add_entity_id_and_filename( 912 self, datasetId: str, manifest: pd.DataFrame 913 ) -> pd.DataFrame: 914 """add entityid and filename column to an existing manifest assuming entityId column is not already present 915 916 Args: 917 datasetId (str): dataset syn id 918 manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty 919 920 Returns: 921 pd.DataFrame: returns a pandas dataframe 922 """ 923 # get file names and entity ids of a given dataset 924 dataset_files_dict = self._get_files_metadata_from_dataset( 925 datasetId, only_new_files=False 926 ) 927 928 if dataset_files_dict: 929 # turn manifest dataframe back to a dictionary for operation 930 manifest_dict = manifest.to_dict("list") 931 932 # update Filename column 933 # add entityId column to the end 934 manifest_dict.update(dataset_files_dict) 935 936 # if the component column exists in existing manifest, fill up that column 937 if "Component" in manifest_dict.keys(): 938 manifest_dict["Component"] = manifest_dict["Component"] * max( 939 1, len(manifest_dict["Filename"]) 940 ) 941 942 # turn dictionary back to a dataframe 943 manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index") 944 manifest_df_updated = manifest_df_index.transpose() 945 946 # fill na with empty string 947 manifest_df_updated = manifest_df_updated.fillna("") 948 949 # drop index 950 manifest_df_updated = manifest_df_updated.reset_index(drop=True) 951 952 return manifest_df_updated 953 else: 954 return manifest 955 956 def fill_in_entity_id_filename( 957 self, datasetId: str, manifest: pd.DataFrame 958 ) -> Tuple[List, pd.DataFrame]: 959 """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present. 960 961 Args: 962 datasetId (str): dataset syn id 963 manifest (pd.DataFrame): existing manifest dataframe. 
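
    # Illustrative sketch (not part of the original module): filling entityId and
    # Filename into a manifest whose Filename column is empty; the dataset ID and
    # component are hypothetical.
    #
    #     manifest = pd.DataFrame({"Filename": [""], "Component": ["BulkRNA-seq"]})
    #     updated = store.add_entity_id_and_filename("syn999", manifest)
    #     # `updated` has one row per file in the dataset; the single Component
    #     # value is repeated for every row and entityId is appended as the last column.
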
    def fill_in_entity_id_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> Tuple[List, pd.DataFrame]:
        """Fill in the Filename and entityId columns. The entityId and Filename columns will be created if not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe.

        Returns:
            Tuple[List, pd.DataFrame]: a list of files under the given datasetId folder and the updated manifest dataframe
        """
        # get the dataset file names and entity ids as a list of tuples
        dataset_files = self.getFilesInStorageDataset(datasetId)

        # update the manifest with additional filenames, if any
        # note that if there is an existing manifest and there are files in the dataset
        # the columns Filename and entityId are assumed to be present in the manifest schema
        # TODO: use idiomatic pandas syntax
        if not dataset_files:
            manifest = manifest.fillna("")
            return dataset_files, manifest

        all_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=False, manifest=manifest
        )
        new_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=True, manifest=manifest
        )

        all_files = pd.DataFrame(all_files)
        new_files = pd.DataFrame(new_files)

        # update the manifest so that it contains the new dataset files
        manifest = (
            pd.concat([manifest, new_files], sort=False)
            .reset_index()
            .drop("index", axis=1)
        )

        # Reindex the manifest and new files dataframes according to entityIds to align file paths and metadata
        manifest_reindex = manifest.set_index("entityId")
        all_files_reindex = all_files.set_index("entityId")
        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
            manifest_reindex
        )

        # Check if the individual file paths in the manifest and from Synapse match
        file_paths_match = (
            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
        )

        # If any of the paths do not match, update the manifest with the file paths from Synapse
        if not file_paths_match.all():
            manifest_reindex.loc[
                ~file_paths_match, "Filename"
            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

        # reformat the manifest for further use
        manifest = manifest_reindex.reset_index()
        entityIdCol = manifest.pop("entityId")
        manifest.insert(len(manifest.columns), "entityId", entityIdCol)

        manifest = manifest.fillna("")
        return dataset_files, manifest

    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
    def updateDatasetManifestFiles(
        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
    ) -> Union[Tuple[str, pd.DataFrame], None]:
        """Fetch the names and entity IDs of all current files in the dataset in the store, if any; update the dataset's manifest with new files, if any.

        Args:
            dmge: DataModelGraphExplorer instance
            datasetId: synapse ID of a storage dataset.
            store: if set to True, store the updated manifest in the asset store; if set to False,
                return a pandas dataframe containing the updated manifest but do not store it in the asset store

        Returns:
            Synapse ID of the updated manifest and a pandas dataframe containing the updated manifest.
            If there is no existing manifest or if the manifest does not have an entityId column, return None.
        """

        # get the existing manifest's Synapse ID
        manifest_id = self.getDatasetManifest(datasetId)

        # if there is no manifest, return None
        if not manifest_id:
            return None

        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_id, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path
        manifest = load_df(manifest_filepath)

        # If the manifest does not have an entityId column, trigger a new manifest to be generated
        if "entityId" not in manifest.columns:
            return None

        manifest_is_file_based = "Filename" in manifest.columns

        if manifest_is_file_based:
            # update the manifest with additional filenames, if any
            # note that if there is an existing manifest and there are files in the dataset
            # the columns Filename and entityId are assumed to be present in the manifest schema
            # TODO: use idiomatic pandas syntax
            dataset_files, manifest = self.fill_in_entity_id_filename(
                datasetId, manifest
            )
            if dataset_files:
                # update the manifest file so that it contains the relevant entity IDs
                if store:
                    manifest.to_csv(manifest_filepath, index=False)

                    # store the manifest and update the associated metadata with the manifest on Synapse
                    manifest_id = self.associateMetadataWithFiles(
                        dmge, manifest_filepath, datasetId
                    )

        return manifest_id, manifest
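
    # Illustrative sketch (not part of the original module); the dataset ID is
    # hypothetical:
    #
    #     result = store.updateDatasetManifestFiles(dmge, "syn999", store=False)
    #     if result is not None:
    #         manifest_id, manifest = result
    #         # `manifest` includes any files added to the dataset since the
    #         # manifest was last stored; nothing was written back to Synapse.
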
    def _get_file_entityIds(
        self,
        dataset_files: List,
        only_new_files: bool = False,
        manifest: pd.DataFrame = None,
    ):
        """
        Get a dictionary of files in a dataset: either files that are not in the current manifest, or all files.

        Args:
            dataset_files: list of all files in a dataset
            only_new_files: boolean to control whether only new files are returned or all files in the dataset
            manifest: metadata manifest
        Returns:
            files: dictionary of file names and entityIds, with scope as specified by `only_new_files`
        """
        files = {"Filename": [], "entityId": []}

        if only_new_files:
            if manifest is None:
                raise UnboundLocalError(
                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
                )

            if "entityId" not in manifest.columns:
                raise ValueError(
                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
                    "Please generate an empty manifest without annotations, manually add annotations to the "
                    "appropriate files in the manifest, and then try again."
                )

            # find new files (that are not in the current manifest), if any
            for file_id, file_name in dataset_files:
                if file_id not in manifest["entityId"].values:
                    files["Filename"].append(file_name)
                    files["entityId"].append(file_id)
        else:
            # get all files
            for file_id, file_name in dataset_files:
                files["Filename"].append(file_name)
                files["entityId"].append(file_id)

        return files

    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
    def getProjectManifests(
        self, projectId: str
    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
        """Gets all metadata manifest files across all datasets in a specified project.

        Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
            as a list of tuples, one for each manifest:
            [
                (
                    (datasetId, dataName),
                    (manifestId, manifestName),
                    (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
                ),
                ...
            ]

        TODO: Return a manifest URI instead of a Synapse ID for interoperability with other implementations of a store interface
        """
        component = None
        entity = None
        manifests = []

        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            # Get the synID of the manifest for a dataset
            manifestId = self.getDatasetManifest(datasetId)

            # If a manifest exists, get the annotations for it; else return the base 'manifest' tuple
            if manifestId:
                annotations = self.getFileAnnotations(manifestId)

                # If the manifest has annotations specifying the component, use that
                if annotations and "Component" in annotations:
                    component = annotations["Component"]
                    entity = self.synapse_entity_tracker.get(
                        synapse_id=manifestId, syn=self.syn, download_file=False
                    )
                    manifest_name = entity["properties"]["name"]

                # otherwise download the manifest and parse it for the information
                elif not annotations or "Component" not in annotations:
                    logging.debug(
                        f"No component annotations have been found for manifest {manifestId}. "
                        "The manifest will be downloaded and parsed instead. "
                        "For increased speed, add component annotations to the manifest."
                    )

                    manifest_info = self.getDatasetManifest(
                        datasetId, downloadFile=True
                    )
                    manifest_name = manifest_info["properties"].get("name", "")

                    if not manifest_name:
                        logger.error(f"Failed to download manifests from {datasetId}")

                    manifest_path = manifest_info["path"]

                    manifest_df = load_df(manifest_path)

                    # Get the component from the Component column if it exists
                    if (
                        "Component" in manifest_df
                        and not manifest_df["Component"].empty
                    ):
                        component = list(set(manifest_df["Component"]))

                        # Added to address issues raised during DCA testing
                        if "" in component:
                            component.remove("")

                        if len(component) == 1:
                            component = component[0]
                        elif len(component) > 1:
                            logging.warning(
                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
                                "Behavior of manifests with multiple components is undefined."
                            )
            else:
                manifest_name = ""
                component = None
            if component:
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    (component, component),
                )
            elif manifestId:
                logging.debug(
                    f"Manifest {manifestId} does not have an associated Component"
                )
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    ("", ""),
                )
            else:
                manifest = (
                    (datasetId, datasetName),
                    ("", ""),
                    ("", ""),
                )

            if manifest:
                manifests.append(manifest)

        return manifests

    def upload_project_manifests_to_synapse(
        self, dmge: DataModelGraphExplorer, projectId: str
    ) -> List[str]:
        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.

        Returns: a list of the names of the datasets whose manifests were loaded.
        """

        manifests = []
        manifest_loaded = []
        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            manifest = ((datasetId, datasetName), ("", ""), ("", ""))

            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
            if manifest_info:
                manifest_id = manifest_info["properties"]["id"]
                manifest_name = manifest_info["properties"]["name"]
                manifest_path = manifest_info["path"]
                manifest_df = load_df(manifest_path)
                manifest_table_id = self.uploadDB(
                    dmge=dmge,
                    manifest=manifest_df,
                    datasetId=datasetId,
                    table_name=datasetName,
                )
                manifest_loaded.append(datasetName)
        return manifest_loaded
1276 1277 """ 1278 # Instantiate DataModelParser 1279 data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld) 1280 # Parse Model 1281 parsed_data_model = data_model_parser.parse_model() 1282 1283 # Instantiate DataModelGraph 1284 data_model_grapher = DataModelGraph(parsed_data_model) 1285 1286 # Generate graph 1287 graph_data_model = data_model_grapher.generate_data_model_graph() 1288 1289 # Instantiate DataModelGraphExplorer 1290 dmge = DataModelGraphExplorer(graph_data_model) 1291 1292 manifests = [] 1293 manifest_loaded = [] 1294 datasets = self.getStorageDatasetsInProject(projectId) 1295 for datasetId, datasetName in datasets: 1296 # encode information about the manifest in a simple list (so that R clients can unpack it) 1297 # eventually can serialize differently 1298 1299 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1300 manifests.append(manifest) 1301 1302 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1303 1304 if manifest_info: 1305 manifest_id = manifest_info["properties"]["id"] 1306 manifest_name = manifest_info["properties"]["name"] 1307 manifest_path = manifest_info["path"] 1308 manifest = ( 1309 (datasetId, datasetName), 1310 (manifest_id, manifest_name), 1311 ("", ""), 1312 ) 1313 if not dry_run: 1314 self.associateMetadataWithFiles( 1315 dmge, manifest_path, datasetId, manifest_record_type="table" 1316 ) 1317 manifest_loaded.append(manifest) 1318 1319 return manifests, manifest_loaded 1320 1321 def move_entities_to_new_project( 1322 self, 1323 projectId: str, 1324 newProjectId: str, 1325 returnEntities: bool = False, 1326 dry_run: bool = False, 1327 ): 1328 """ 1329 For each manifest csv in a project, look for all the entitiy ids that are associated. 1330 Look up the entitiy in the files, move the entity to new project. 1331 """ 1332 1333 manifests = [] 1334 manifest_loaded = [] 1335 datasets = self.getStorageDatasetsInProject(projectId) 1336 if datasets: 1337 for datasetId, datasetName in datasets: 1338 # encode information about the manifest in a simple list (so that R clients can unpack it) 1339 # eventually can serialize differently 1340 1341 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1342 manifests.append(manifest) 1343 1344 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1345 if manifest_info: 1346 manifest_id = manifest_info["properties"]["id"] 1347 manifest_name = manifest_info["properties"]["name"] 1348 manifest_path = manifest_info["path"] 1349 manifest_df = load_df(manifest_path) 1350 1351 manifest = ( 1352 (datasetId, datasetName), 1353 (manifest_id, manifest_name), 1354 ("", ""), 1355 ) 1356 manifest_loaded.append(manifest) 1357 1358 annotation_entities = self.storageFileviewTable[ 1359 (self.storageFileviewTable["id"].isin(manifest_df["entityId"])) 1360 & (self.storageFileviewTable["type"] == "folder") 1361 ]["id"] 1362 1363 if returnEntities: 1364 for entityId in annotation_entities: 1365 if not dry_run: 1366 moved_entity = self.syn.move(entityId, datasetId) 1367 self.synapse_entity_tracker.add( 1368 synapse_id=moved_entity.id, entity=moved_entity 1369 ) 1370 else: 1371 logging.info( 1372 f"{entityId} will be moved to folder {datasetId}." 
1373 ) 1374 else: 1375 # generate project folder 1376 archive_project_folder = Folder( 1377 projectId + "_archive", parent=newProjectId 1378 ) 1379 archive_project_folder = self.syn.store(archive_project_folder) 1380 self.synapse_entity_tracker.add( 1381 synapse_id=archive_project_folder.id, 1382 entity=archive_project_folder, 1383 ) 1384 1385 # generate dataset folder 1386 dataset_archive_folder = Folder( 1387 "_".join([datasetId, datasetName, "archive"]), 1388 parent=archive_project_folder.id, 1389 ) 1390 dataset_archive_folder = self.syn.store(dataset_archive_folder) 1391 self.synapse_entity_tracker.add( 1392 synapse_id=dataset_archive_folder.id, 1393 entity=dataset_archive_folder, 1394 ) 1395 1396 for entityId in annotation_entities: 1397 # move entities to folder 1398 if not dry_run: 1399 moved_entity = self.syn.move( 1400 entityId, dataset_archive_folder.id 1401 ) 1402 self.synapse_entity_tracker.add( 1403 synapse_id=moved_entity.id, entity=moved_entity 1404 ) 1405 else: 1406 logging.info( 1407 f"{entityId} will be moved to folder {dataset_archive_folder.id}." 1408 ) 1409 else: 1410 raise LookupError( 1411 f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." 1412 ) 1413 return manifests, manifest_loaded 1414 1415 @tracer.start_as_current_span("SynapseStorage::get_synapse_table") 1416 def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: 1417 """Download synapse table as a pd dataframe; return table schema and etags as results too 1418 1419 Args: 1420 synapse_id: synapse ID of the table to query 1421 """ 1422 1423 results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) 1424 df = results.asDataFrame( 1425 rowIdAndVersionInIndex=False, 1426 na_values=STR_NA_VALUES_FILTERED, 1427 keep_default_na=False, 1428 ) 1429 1430 return df, results 1431 1432 @missing_entity_handler 1433 @tracer.start_as_current_span("SynapseStorage::uploadDB") 1434 def uploadDB( 1435 self, 1436 dmge: DataModelGraphExplorer, 1437 manifest: pd.DataFrame, 1438 datasetId: str, 1439 table_name: str, 1440 restrict: bool = False, 1441 table_manipulation: str = "replace", 1442 table_column_names: str = "class_label", 1443 ): 1444 """ 1445 Method to upload a database to an asset store. In synapse, this will upload a metadata table 1446 1447 Args: 1448 dmge: DataModelGraphExplorer object 1449 manifest: pd.Df manifest to upload 1450 datasetId: synID of the dataset for the manifest 1451 table_name: name of the table to be uploaded 1452 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1453 existingTableId: str of the synId of the existing table, if one already exists 1454 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1455 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 1456 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 1457 display label formatting. 
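
    # Illustrative sketch (not part of the original module); the table ID is
    # hypothetical:
    #
    #     df, results = store.get_synapse_table("syn777")
    #     # `df` holds the table rows; `results` is the CsvFileTable carrying the
    #     # schema and etags needed for subsequent updates.
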
1458 Returns: 1459 manifest_table_id: synID of the uploaded table 1460 manifest: the original manifset 1461 table_manifest: manifest formatted appropriately for the table 1462 1463 """ 1464 1465 col_schema, table_manifest = self.formatDB( 1466 dmge=dmge, manifest=manifest, table_column_names=table_column_names 1467 ) 1468 1469 manifest_table_id = self.buildDB( 1470 datasetId, 1471 table_name, 1472 col_schema, 1473 table_manifest, 1474 table_manipulation, 1475 dmge, 1476 restrict, 1477 ) 1478 1479 return manifest_table_id, manifest, table_manifest 1480 1481 @tracer.start_as_current_span("SynapseStorage::formatDB") 1482 def formatDB(self, dmge, manifest, table_column_names): 1483 """ 1484 Method to format a manifest appropriatly for upload as table 1485 1486 Args: 1487 dmge: DataModelGraphExplorer object 1488 manifest: pd.Df manifest to upload 1489 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 1490 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 1491 display label formatting. 1492 Returns: 1493 col_schema: schema for table columns: type, size, etc 1494 table_manifest: formatted manifest 1495 1496 """ 1497 # Rename the manifest columns to display names to match fileview 1498 1499 blacklist_chars = ["(", ")", ".", " ", "-"] 1500 manifest_columns = manifest.columns.tolist() 1501 1502 table_manifest = deepcopy(manifest) 1503 1504 if table_column_names == "display_name": 1505 cols = table_manifest.columns 1506 1507 elif table_column_names == "display_label": 1508 cols = [ 1509 str(col).translate({ord(x): "" for x in blacklist_chars}) 1510 for col in manifest_columns 1511 ] 1512 1513 elif table_column_names == "class_label": 1514 cols = [ 1515 get_class_label_from_display_name(str(col)).translate( 1516 {ord(x): "" for x in blacklist_chars} 1517 ) 1518 for col in manifest_columns 1519 ] 1520 else: 1521 ValueError( 1522 f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only." 1523 ) 1524 1525 cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols)) 1526 1527 # Reset column names in table manifest 1528 table_manifest.columns = cols 1529 1530 # move entity id to end of df 1531 entity_col = table_manifest.pop("entityId") 1532 table_manifest.insert(len(table_manifest.columns), "entityId", entity_col) 1533 1534 # Get the column schema 1535 col_schema = as_table_columns(table_manifest) 1536 1537 # Set Id column length to 64 (for some reason not being auto set.) 
        for i, col in enumerate(col_schema):
            if col["name"].lower() == "id":
                col_schema[i]["maximumSize"] = 64

        return col_schema, table_manifest

    @tracer.start_as_current_span("SynapseStorage::buildDB")
    def buildDB(
        self,
        datasetId: str,
        table_name: str,
        col_schema: List,
        table_manifest: pd.DataFrame,
        table_manipulation: str,
        dmge: DataModelGraphExplorer,
        restrict: bool = False,
    ):
        """
        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
        Calls TableOperations class to execute

        Args:
            datasetId: synID of the dataset for the manifest
            table_name: name of the table to be uploaded
            col_schema: schema for table columns: type, size, etc from `formatDB`
            table_manifest: formatted manifest that can be uploaded as a table
            table_manipulation: str, 'replace', 'upsert' or 'update'. In the case where a manifest already exists, should the new metadata replace the existing (replace), be added to it (upsert), or update the existing rows (update)
            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions

        Returns:
            manifest_table_id: synID of the uploaded table

        """
        table_parent_id = self.getDatasetProject(datasetId=datasetId)
        existing_table_id = self.syn.findEntityId(
            name=table_name, parent=table_parent_id
        )
        tableOps = TableOperations(
            synStore=self,
            tableToLoad=table_manifest,
            tableName=table_name,
            datasetId=datasetId,
            existingTableId=existing_table_id,
            restrict=restrict,
            synapse_entity_tracker=self.synapse_entity_tracker,
        )

        if not table_manipulation or existing_table_id is None:
            manifest_table_id = tableOps.createTable(
                columnTypeDict=col_schema,
                specifySchema=True,
            )
        elif existing_table_id is not None:
            if table_manipulation.lower() == "replace":
                manifest_table_id = tableOps.replaceTable(
                    specifySchema=True,
                    columnTypeDict=col_schema,
                )
            elif table_manipulation.lower() == "upsert":
                manifest_table_id = tableOps.upsertTable(
                    dmge=dmge,
                )
            elif table_manipulation.lower() == "update":
                manifest_table_id = tableOps.updateTable()

        if table_manipulation and table_manipulation.lower() == "upsert":
            table_entity = self.synapse_entity_tracker.get(
                synapse_id=existing_table_id or manifest_table_id,
                syn=self.syn,
                download_file=False,
            )
            annos = OldAnnotations(
                id=table_entity.id,
                etag=table_entity.etag,
                values=table_entity.annotations,
            )
            annos["primary_key"] = table_manifest["Component"][0] + "_id"
            annos = self.syn.set_annotations(annos)
            table_entity.etag = annos.etag
            table_entity.annotations = annos

        return manifest_table_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
    def upload_manifest_file(
        self,
        manifest,
        metadataManifestPath,
        datasetId,
        restrict_manifest,
        component_name="",
    ):
        # Update manifest to have the new entityId column
        manifest.to_csv(metadataManifestPath, index=False)

        # store manifest to Synapse as a CSV
        # update file name
        file_name_full = metadataManifestPath.split("/")[-1]
        file_extension = file_name_full.split(".")[-1]

        # Differentiate "censored" and "uncensored" manifest
        if "censored" in file_name_full:
            file_name_new = (
                os.path.basename(CONFIG.synapse_manifest_basename)
                + "_"
                + component_name
                + "_censored"
                + "."
                + file_extension
            )
        else:
            file_name_new = (
                os.path.basename(CONFIG.synapse_manifest_basename)
                + "_"
                + component_name
                + "."
                + file_extension
            )

        manifest_synapse_file = None
        try:
            # Rename the file to file_name_new, then revert.
            # This is to maintain the original file name in case other code is
            # expecting that the file exists with the original name.
            original_file_path = metadataManifestPath
            new_file_path = os.path.join(
                os.path.dirname(metadataManifestPath), file_name_new
            )
            os.rename(original_file_path, new_file_path)

            manifest_synapse_file = self._store_file_for_manifest_upload(
                new_file_path=new_file_path,
                dataset_id=datasetId,
                existing_file_name=file_name_full,
                file_name_new=file_name_new,
                restrict_manifest=restrict_manifest,
            )
            manifest_synapse_file_id = manifest_synapse_file.id

        finally:
            # Revert the file name back to the original
            os.rename(new_file_path, original_file_path)

            if manifest_synapse_file:
                manifest_synapse_file.path = original_file_path

        return manifest_synapse_file_id

    def _store_file_for_manifest_upload(
        self,
        new_file_path: str,
        dataset_id: str,
        existing_file_name: str,
        file_name_new: str,
        restrict_manifest: bool,
    ) -> File:
        """Handles a create or update of a manifest file that is going to be uploaded.
        If we already have a copy of the Entity in memory we will update that instance,
        otherwise create a new File instance to be created in Synapse. Once stored
        this will add the file to the `synapse_entity_tracker` for future reference.

        Args:
            new_file_path (str): The path to the new manifest file
            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
            existing_file_name (str): The name of the existing file
            file_name_new (str): The name of the new file
            restrict_manifest (bool): Whether the manifest should be restricted

        Returns:
            File: The stored manifest file
        """
        local_tracked_file_instance = (
            self.synapse_entity_tracker.search_local_by_parent_and_name(
                name=existing_file_name, parent_id=dataset_id
            )
            or self.synapse_entity_tracker.search_local_by_parent_and_name(
                name=file_name_new, parent_id=dataset_id
            )
        )

        if local_tracked_file_instance:
            local_tracked_file_instance.path = new_file_path
            local_tracked_file_instance.description = (
                "Manifest for dataset " + dataset_id
            )
            manifest_synapse_file = local_tracked_file_instance
        else:
            manifest_synapse_file = File(
                path=new_file_path,
                description="Manifest for dataset " + dataset_id,
                parent=dataset_id,
                name=file_name_new,
            )

        manifest_synapse_file = self.syn.store(
            manifest_synapse_file, isRestricted=restrict_manifest
        )

        self.synapse_entity_tracker.add(
            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
        )
        return manifest_synapse_file

    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
        """Get annotations asynchronously.

        Args:
            synapse_id (str): synapse id of the entity that the annotation belongs to

        Returns:
            Dict[str, Any]: The requested entity bundle matching
            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
        """
        return await get_entity_id_bundle2(
            entity_id=synapse_id,
            request={"includeAnnotations": True},
            synapse_client=self.syn,
        )

    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
        """Store an annotation in an async way.

        Args:
            annotation_dict (dict): annotation in a dictionary format

        Returns:
            Annotations: The stored annotations.
        """
        annotation_data = Annotations.from_dict(
            synapse_annotations=annotation_dict["annotations"]["annotations"]
        )
        annotation_class = Annotations(
            annotations=annotation_data,
            etag=annotation_dict["annotations"]["etag"],
            id=annotation_dict["annotations"]["id"],
        )
        annotation_storage_result = await annotation_class.store_async(
            synapse_client=self.syn
        )
        local_entity = self.synapse_entity_tracker.get(
            synapse_id=annotation_dict["annotations"]["id"],
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if local_entity:
            local_entity.etag = annotation_storage_result.etag
            local_entity.annotations = annotation_storage_result
        return annotation_storage_result

    def process_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        metadata_syn: Dict[str, Any],
        hide_blanks: bool,
        csv_list_regex: str,
        annos: Dict[str, Any],
        annotation_keys: str,
    ) -> Dict[str, Any]:
        """Processes metadata annotations based on the logic below:
        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
            An empty or whitespace-only string.
            A NaN value (if the annotation is a float).
        If either condition is met and hide_blanks is True, the annotation key is not uploaded and further processing of that annotation key is skipped.
        If either condition is met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.

        2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name".
            Check if the rule contains "list" as a rule; if it does, split the string by commas and assign the resulting list as the annotation value for that key.

        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

        4. Returns the updated annotations dictionary.

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer
            metadata_syn (dict): metadata used for Synapse storage
            hide_blanks (bool): if true, does not upload annotation keys with blank values.
            csv_list_regex (str): Regex to match with comma separated list
            annos (Dict[str, Any]): dictionary of annotations returned from Synapse
            annotation_keys (str): display_label/class_label

        Returns:
            Dict[str, Any]: annotations as a dictionary

        ```mermaid
        flowchart TD
            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
            C -- Yes --> D{Is hide_blanks True?}
            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
            D -- No --> F[Assign empty string to annotation key]
            C -- No --> G{Is anno_v a string?}
            G -- No --> H[Assign original value of anno_v to annotation key]
            G -- Yes --> I{Does anno_v match csv_list_regex?}
            I -- Yes --> J[Get validation rule of anno_k]
            J --> K{Does the validation rule contain 'list'}
            K -- Yes --> L[Split anno_v by commas and assign as list]
            I -- No --> H
            K -- No --> H
        ```
        """
        for anno_k, anno_v in metadata_syn.items():
            # Remove keys with NaN values, empty strings, or whitespace-only strings
            # from the dict of annotations to be uploaded, if present on the current
            # data annotation
            if hide_blanks and (
                (isinstance(anno_v, str) and anno_v.strip() == "")
                or (isinstance(anno_v, float) and np.isnan(anno_v))
            ):
                annos["annotations"]["annotations"].pop(anno_k, None)
                continue

            # Otherwise save the annotation as appropriate
            if isinstance(anno_v, float) and np.isnan(anno_v):
                annos["annotations"]["annotations"][anno_k] = ""
                continue

            # Handle strings that match the csv_list_regex and pass the validation rule
            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
                # Use a dictionary to dynamically choose the argument
                param = (
                    {"node_display_name": anno_k}
                    if annotation_keys == "display_label"
                    else {"node_label": anno_k}
                )
                node_validation_rules = dmge.get_node_validation_rules(**param)

                if rule_in_rule_list("list", node_validation_rules):
                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
                    continue
            # default: assign the original value
            annos["annotations"]["annotations"][anno_k] = anno_v

        return annos

    @async_missing_entity_handler
    async def format_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        row: pd.Series,
        entityId: str,
        hideBlanks: bool,
        annotation_keys: str,
    ) -> Union[None, Dict[str, Any]]:
        """Format row annotations

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer object
            row (pd.Series): row of the manifest
            entityId (str): entity id of the manifest
            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, uploads annotation keys with empty string values
            annotation_keys (str): display_label/class_label

        Returns:
            Union[None, Dict[str, Any]]: if the entity id is in the trash can, return None. Otherwise, return the annotations
        """
        # Prepare metadata for Synapse storage (resolve display name into a name that
        # Synapse annotations support (e.g. no spaces, parentheses)).
        # Note: the removal of special characters will apply only to annotation keys;
        # we are not altering the manifest. This could create a divergence between
        # manifest columns and annotations. This should be ok for most use cases.
        # Columns with special characters are outside of the schema.
        metadataSyn = {}
        blacklist_chars = ["(", ")", ".", " ", "-"]

        for k, v in row.to_dict().items():
            if annotation_keys == "display_label":
                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
            elif annotation_keys == "class_label":
                keySyn = get_class_label_from_display_name(str(k)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )

            # Skip `Filename` and `ETag` columns when setting annotations
            if keySyn in ["Filename", "ETag", "eTag"]:
                continue

            # Truncate annotation values to 500 characters if the
            # size of the value is greater than or equal to 500 characters.
            # Append an explicit [truncatedByDataCuratorApp] message to the end
            # of every truncated value to indicate that the cell value
            # has been truncated.
            if isinstance(v, str) and len(v) >= 500:
                v = v[0:472] + "[truncatedByDataCuratorApp]"

            metadataSyn[keySyn] = v

        # This will first check if the entity is already in memory, and if so, that
        # instance is used. Unfortunately, the expected return format needs to match
        # the Synapse API, so we need to convert the annotations to the expected format.
        entity = self.synapse_entity_tracker.get(
            synapse_id=entityId,
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if entity is not None:
            synapse_annotations = _convert_to_annotations_list(
                annotations=entity.annotations
            )
            annos = {
                "annotations": {
                    "id": entity.id,
                    "etag": entity.etag,
                    "annotations": synapse_annotations,
                }
            }
        else:
            annos = await self.get_async_annotation(entityId)

        # set annotation(s) for the various objects/items in a dataset on Synapse
        csv_list_regex = comma_separated_list_regex()

        annos = self.process_row_annotations(
            dmge=dmge,
            metadata_syn=metadataSyn,
            hide_blanks=hideBlanks,
            csv_list_regex=csv_list_regex,
            annos=annos,
            annotation_keys=annotation_keys,
        )

        return annos

    @missing_entity_handler
    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
    def format_manifest_annotations(self, manifest, manifest_synapse_id):
        """
        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
        For now just getting the Component.
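
        Example (illustrative; the IDs and values are hypothetical):
            format_manifest_annotations(manifest, "syn123") returns an
            Annotations object that includes the manifest's own metadata,
            e.g. {"Component": "Biospecimen", "entityId": "syn123", ...}.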
        """

        entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
        )
        is_file = entity.concreteType.endswith(".FileEntity")
        is_table = entity.concreteType.endswith(".TableEntity")

        if is_file:
            # Get file metadata
            metadata = self.getFileAnnotations(manifest_synapse_id)

            # If there is a defined component add it to the metadata.
            if "Component" in manifest.columns:
                # Gather component information
                component = manifest["Component"].unique()

                # Double check that only a single component is listed, else raise an error.
                if len(component) > 1:
                    raise ValueError(
                        "Manifest has more than one component. Please check the manifest and resubmit."
                    )

                # Add component to metadata
                metadata["Component"] = component[0]

        elif is_table:
            # Get table metadata
            metadata = self.getTableAnnotations(manifest_synapse_id)

        # Get annotations
        annos = OldAnnotations(
            id=entity.id, etag=entity.etag, values=entity.annotations
        )

        # Add metadata to the annotations
        for annos_k, annos_v in metadata.items():
            annos[annos_k] = annos_v
        return annos

    '''
    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
        """
        Purpose:
            Works very similarly to associateMetadataWithFiles except takes in the manifest
            rather than the manifest path

        """

        # Add uuid for table updates and fill.
        if not "Uuid" in manifest.columns:
            manifest["Uuid"] = ''

        for idx,row in manifest.iterrows():
            if not row["Uuid"]:
                gen_uuid = uuid.uuid4()
                row["Uuid"] = gen_uuid
                manifest.loc[idx, 'Uuid'] = gen_uuid

        # add entityId as a column if not already there or
        # fill any blanks with an empty string.
        if not "entityId" in manifest.columns:
            manifest["entityId"] = ""
        else:
            manifest["entityId"].fillna("", inplace=True)

        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
        dmge = DataModelGraphExplorer()

        # Create table name here.
        if 'Component' in manifest.columns:
            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
        else:
            table_name = 'synapse_storage_manifest_table'

        # Upload manifest as a table and get the SynID and manifest
        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
            dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)

        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
        # also set metadata for each synapse entity as Synapse annotations
        for idx, row in manifest.iterrows():
            if not row["entityId"]:
                # If not using entityIds, fill with manifest_table_id so
                row["entityId"] = manifest_synapse_table_id
                entityId = ''
            else:
                # get the entity id corresponding to this row
                entityId = row["entityId"]

        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)

        # Get annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)

        self.syn.set_annotations(manifest_annotations)

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
        self.make_synapse_table(
            table_to_load = table_manifest,
            dataset_id = datasetId,
            existingTableId = manifest_synapse_table_id,
            table_name = table_name,
            update_col = 'Uuid',
            specify_schema = False,
        )

        # Get annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
        self.syn.set_annotations(manifest_annotations)
        return manifest_synapse_table_id
    '''

    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
        """Helper function to read in a provided manifest as a pandas DataFrame for subsequent downstream processing.
        Args:
            metadataManifestPath (str): path where the manifest is stored
        Returns:
            manifest (pd.DataFrame): Manifest loaded as a pandas dataframe
        Raises:
            FileNotFoundError: Manifest file does not exist at the provided path.
        """
        # read new manifest csv
        try:
            load_args = {
                "dtype": "string",
            }
            manifest = load_df(
                metadataManifestPath,
                preserve_raw_input=False,
                allow_na_values=False,
                **load_args,
            )
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"No manifest file was found at this path: {metadataManifestPath}"
            ) from err
        return manifest

    def _add_id_columns_to_manifest(
        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
    ) -> pd.DataFrame:
        """
        Ensures that the manifest DataFrame has standardized 'Id' and 'entityId' columns.

        - If any case variation of the 'id' column is present (e.g., 'id', 'ID', 'iD'), it is renamed to 'Id'.
        - If any case variation of the 'entityid' column is present, it is renamed to 'entityId'.
        - If any case variation of the 'uuid' column is present, it is renamed to 'uuid' before further processing.
        - If 'Id' is still missing:
            - It will be created as an empty column, or
            - Derived from a 'Uuid' column, depending on whether 'uuid' is defined in the schema.
        - If both 'uuid' and 'Id' columns exist, the 'uuid' column is dropped.
        - Missing values in the 'Id' column are filled with generated UUIDs.
        - If 'entityId' is still missing, it will be created and filled with empty strings.
        - If 'entityId' is already present, any missing values will be replaced with empty strings.

        Args:
            manifest (pd.DataFrame): The metadata manifest to be updated.
            dmge (DataModelGraphExplorer): Data model graph explorer object.

        Returns:
            pd.DataFrame: The updated manifest with a standardized 'Id' column and an 'entityId' column.
        """

        # Normalize any variation of 'id' to 'Id', "entityid" to "entityId", "Uuid" to "uuid"
        for col in manifest.columns:
            if col.lower() == "id":
                manifest = manifest.rename(columns={col: ID_COLUMN})
            if col.lower() == "entityid":
                manifest = manifest.rename(columns={col: ENTITY_ID_COLUMN})
            if col.lower() == "uuid":
                manifest = manifest.rename(columns={col: UUID_COLUMN})

        # If 'Id' still doesn't exist, see if a uuid column exists
        # Rename the uuid column to the 'Id' column
        if ID_COLUMN not in manifest.columns:
            # See if the schema has a `Uuid` column specified
            try:
                uuid_col_in_schema = dmge.is_class_in_schema(
                    "Uuid"
                ) or dmge.is_class_in_schema("uuid")
            except KeyError:
                uuid_col_in_schema = False

            # Rename the `uuid` column if it wasn't specified in the schema
            if UUID_COLUMN in manifest.columns and not uuid_col_in_schema:
                manifest = manifest.rename(columns={UUID_COLUMN: ID_COLUMN})
            # If no `uuid` column exists or it is specified in the schema, create a new `Id` column
            else:
                manifest[ID_COLUMN] = ""
        else:
            # 'Id' already exists, ignore 'uuid'
            if UUID_COLUMN in manifest.columns:
                manifest = manifest.drop(columns=[UUID_COLUMN])

        # Fill in UUIDs in the 'Id' column if missing
        for idx, row in manifest.iterrows():
            if not row["Id"]:
                manifest.loc[idx, ID_COLUMN] = str(uuid.uuid4())

        # Add entityId as a column if not already there
        if ENTITY_ID_COLUMN not in manifest:
            manifest[ENTITY_ID_COLUMN] = ""
        else:
            manifest[ENTITY_ID_COLUMN] = manifest[ENTITY_ID_COLUMN].fillna("")

        return manifest

    def _generate_table_name(self, manifest):
        """Helper function to generate a table name for upload to Synapse.

        Args:
            manifest: Manifest loaded as a pd.DataFrame

        Returns:
            table_name (str): Name of the table to load
            component_name (str): Name of the manifest component (if applicable)
        """
        # Create table name here.
        if "Component" in manifest.columns:
            component_name = manifest["Component"][0].lower()
            table_name = component_name + "_synapse_storage_manifest_table"
        else:
            component_name = ""
            table_name = "synapse_storage_manifest_table"
        return table_name, component_name

    def _create_entity_id(self, idx, row, manifest, datasetId):
        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
        Args:
            idx: index of the current row of the manifest being processed
            row: current row of the manifest being processed
            manifest (pd.DataFrame): loaded df containing user supplied data.
            datasetId (str): synapse ID of folder containing the dataset

        Returns:
            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
            entityId (str): Generated Entity Id.
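
        Example (illustrative; IDs are hypothetical):
            A row whose entityId is blank gets a new Folder named with a
            fresh UUID stored under datasetId, and the folder's Synapse ID
            (e.g. "syn987") is written back to manifest.loc[idx, "entityId"].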

        """
        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
        rowEntity = self.syn.store(rowEntity)
        entityId = rowEntity["id"]
        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
        row["entityId"] = entityId
        manifest.loc[idx, "entityId"] = entityId
        return manifest, entityId

    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
        """Process annotations and store them on Synapse asynchronously.

        Args:
            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by the format_row_annotations function in the previous step

        Raises:
            RuntimeError: raise a runtime error if a task failed to complete
        """
        while requests:
            done_tasks, pending_tasks = await asyncio.wait(
                requests, return_when=asyncio.FIRST_COMPLETED
            )
            requests = pending_tasks

            for completed_task in done_tasks:
                try:
                    annos = completed_task.result()

                    if isinstance(annos, Annotations):
                        logger.info(f"Successfully stored annotations for {annos.id}")
                    else:
                        # store annotations if they are not None
                        if annos:
                            entity_id = annos["annotations"]["id"]
                            logger.info(
                                f"Obtained and processed annotations for {entity_id} entity"
                            )
                            requests.add(
                                asyncio.create_task(
                                    self.store_async_annotation(annotation_dict=annos)
                                )
                            )
                except Exception as e:
                    raise RuntimeError(f"failed with { repr(e) }.") from e

    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
    async def add_annotations_to_entities_files(
        self,
        dmge,
        manifest,
        manifest_record_type: str,
        datasetId: str,
        hideBlanks: bool,
        manifest_synapse_table_id="",
        annotation_keys: str = "class_label",
    ):
        """
        Depending on the upload type, add Ids to the entityId row. Add annotations to connected
        files and folders. Despite the name of this function, it also applies to folders.

        Args:
            dmge: DataModelGraphExplorer Object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
            datasetId (str): synapse ID of folder containing the dataset
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            manifest_synapse_table_id (str): Default is an empty string ''.
            annotation_keys (str): display_label/class_label (default). Determines the labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
        Returns:
            manifest (pd.DataFrame): modified to add entityId as appropriate

        """

        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
        if "filename" in [col.lower() for col in manifest.columns]:
            # get current list of files and store as dataframe
            dataset_files = self.getFilesInStorageDataset(datasetId)
            files_and_entityIds = self._get_file_entityIds(
                dataset_files=dataset_files, only_new_files=False
            )
            file_df = pd.DataFrame(files_and_entityIds)

            # Merge dataframes to add entityIds
            manifest = manifest.merge(
                file_df, how="left", on="Filename", suffixes=["_x", None]
            ).drop("entityId_x", axis=1)

        # Fill `entityId` for each row if missing and annotate entity as appropriate
        requests = set()
        for idx, row in manifest.iterrows():
            if not row["entityId"] and (
                manifest_record_type == "file_and_entities"
                or manifest_record_type == "table_file_and_entities"
            ):
                manifest, entityId = self._create_entity_id(
                    idx, row, manifest, datasetId
                )
            elif not row["entityId"] and manifest_record_type == "table_and_file":
                # If not using entityIds, fill the column with manifest_table_id
                row["entityId"] = manifest_synapse_table_id
                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
                entityId = ""
            # If the row is the manifest table, do not add annotations
            elif row["entityId"] == manifest_synapse_table_id:
                entityId = ""
            else:
                # get the file id of the file to annotate, collected in the step above
                entityId = row["entityId"]

            # Adding annotations to connected files.
            if entityId:
                # Format annotations for Synapse
                annos_task = asyncio.create_task(
                    self.format_row_annotations(
                        dmge, row, entityId, hideBlanks, annotation_keys
                    )
                )
                requests.add(annos_task)
        await self._process_store_annos(requests)
        return manifest

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
    def upload_manifest_as_table(
        self,
        dmge: DataModelGraphExplorer,
        manifest: pd.DataFrame,
        metadataManifestPath: str,
        datasetId: str,
        table_name: str,
        component_name: str,
        restrict: bool,
        manifest_record_type: str,
        hideBlanks: bool,
        table_manipulation: str,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and csv.
        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            table_name (str): Generated to name the table being uploaded.
            component_name (str): Name of the component manifest that is currently being uploaded.
            restrict (bool): Flag for censored data.
            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default is True. If false, do not add annotations to files.
        Return:
            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
        """
        # Upload manifest as a table, get the ID and updated manifest.
        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
        )

        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    manifest_synapse_table_id,
                    annotation_keys,
                )
            )
        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            restrict_manifest=restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(annotations=manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
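        # The call below re-runs uploadDB with table_manipulation="update" so the
        # entityId values filled in above are written back into the existing table
        # rather than replacing it wholesale.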
        manifest_synapse_table_id, manifest, _ = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation="update",
            table_column_names=table_column_names,
        )

        # Set annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
        )
        annotations_manifest_table = self.syn.set_annotations(
            annotations=manifest_annotations
        )
        manifest_table_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
        )
        manifest_table_entity.annotations = annotations_manifest_table
        manifest_table_entity.etag = annotations_manifest_table.etag

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
    def upload_manifest_as_csv(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        restrict,
        manifest_record_type,
        hideBlanks,
        component_name,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a csv only.
        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            restrict (bool): Flag for censored data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default is True. If false, do not add annotations to files.
        Return:
            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
        """
        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    annotation_keys=annotation_keys,
                )
            )

        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest,
            metadataManifestPath,
            datasetId,
            restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest, manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
    def upload_manifest_combo(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        table_name,
        component_name,
        restrict,
        manifest_record_type,
        hideBlanks,
        table_manipulation,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and CSV with entities.
        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            table_name (str): Generated to name the table being uploaded.
            component_name (str): Name of the component manifest that is currently being uploaded.
            restrict (bool): Flag for censored data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default is True. If false, do not add annotations to files.
        Return:
            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
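
        Example (a minimal sketch; the IDs and path are hypothetical):
            upload_manifest_combo(dmge, manifest, "manifest.csv", "syn123",
            table_name, component_name, ...) returns e.g. "syn456", the
            Synapse ID of the uploaded manifest CSV.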
        """
        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
        )

        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    manifest_synapse_table_id,
                    annotation_keys=annotation_keys,
                )
            )

        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest, metadataManifestPath, datasetId, restrict, component_name
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest, manifest_synapse_file_id
        )
        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = file_manifest_annotations
        manifest_entity.etag = file_manifest_annotations.etag
        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation="update",
            table_column_names=table_column_names,
        )

        # Set annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(
            manifest, manifest_synapse_table_id
        )
        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = table_manifest_annotations
        manifest_entity.etag = table_manifest_annotations.etag
        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
    def associateMetadataWithFiles(
        self,
        dmge: DataModelGraphExplorer,
        metadataManifestPath: str,
        datasetId: str,
        manifest_record_type: str = "table_file_and_entities",
        hideBlanks: bool = False,
        restrict_manifest=False,
        table_manipulation: str = "replace",
        table_column_names: str = "class_label",
        annotation_keys: str = "class_label",
        file_annotations_upload: bool = True,
    ) -> str:
        """Associate metadata with files in a storage dataset already on Synapse.
        Upload metadataManifest in the storage dataset folder on Synapse as well. Return the synapseId of the uploaded manifest file.

        If this is a new manifest, there may be no Synapse entities associated with the rows of this manifest.
        This may be because the data type (e.g. clinical data) is tabular
        and does not require files; to utilize uniform interfaces downstream
        (i.e. fileviews), a Synapse entity (a folder) is created for each row,
        and an entity column is added to the manifest containing the resulting
        entity IDs; a table is also created at present as an additional interface
        for downstream query and interaction with the data.

        Args:
            dmge: DataModelGraphExplorer Object
            metadataManifestPath: path to csv containing a validated metadata manifest.
                The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
                Some datasets, e.g. clinical data, do not contain file ids, but data is stored in a table: one row per item.
                In this case, the system creates an entity on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to this entity.
            datasetId: synapse ID of folder containing the dataset
            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_and_entities and table in combination.
            hideBlanks: Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            restrict_manifest (bool): Default is False. Flag for censored data.
            table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
        Returns:
            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
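
        Example (a minimal usage sketch; assumes an authenticated
        SynapseStorage instance and hypothetical IDs/paths):
            synapse_store = SynapseStorage()
            manifest_id = synapse_store.associateMetadataWithFiles(
                dmge=dmge,
                metadataManifestPath="data/synapse_storage_manifest.csv",
                datasetId="syn12345",
                manifest_record_type="table_and_file",
            )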
        """
        # Read new manifest CSV:
        manifest = self._read_manifest(metadataManifestPath)
        manifest = self._add_id_columns_to_manifest(manifest, dmge)

        table_name, component_name = self._generate_table_name(manifest)

        # Upload manifest to synapse based on user input (manifest_record_type)
        if manifest_record_type == "file_only":
            manifest_synapse_file_id = self.upload_manifest_as_csv(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                component_name=component_name,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        elif manifest_record_type == "table_and_file":
            manifest_synapse_file_id = self.upload_manifest_as_table(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                table_name=table_name,
                component_name=component_name,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                table_manipulation=table_manipulation,
                table_column_names=table_column_names,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        elif manifest_record_type == "file_and_entities":
            manifest_synapse_file_id = self.upload_manifest_as_csv(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                component_name=component_name,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        elif manifest_record_type == "table_file_and_entities":
            manifest_synapse_file_id = self.upload_manifest_combo(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                table_name=table_name,
                component_name=component_name,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                table_manipulation=table_manipulation,
                table_column_names=table_column_names,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        else:
            raise ValueError("Please enter a valid manifest_record_type.")
        return manifest_synapse_file_id

    def getTableAnnotations(self, table_id: str):
        """Generate a dictionary of annotations for the given Synapse table.
        Synapse returns all custom annotations as lists since they
        can contain multiple values. In all cases, the values will
        be converted into strings and concatenated with ", ".

        Args:
            table_id (str): Synapse ID for the table.

        Returns:
            dict: Annotations as comma-separated strings.
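
        Example (illustrative return value; keys depend on the table's annotations):
            {"Component": "Biospecimen", "entityId": "syn123", "eTag": "..."}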
        """
        try:
            entity = self.synapse_entity_tracker.get(
                synapse_id=table_id, syn=self.syn, download_file=False
            )
            is_table = entity.concreteType.endswith(".TableEntity")
            annotations_raw = entity.annotations
        except SynapseHTTPError:
            # If an error occurs with retrieving the entity, skip it
            # This could be caused by a temporary file view that
            # was deleted since its ID was retrieved
            is_table = False

        # Skip anything that isn't a table
        if not is_table:
            return None

        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)

        return annotations

    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
        """Generate a dictionary of annotations for the given Synapse file.
        Synapse returns all custom annotations as lists since they
        can contain multiple values. In all cases, the values will
        be converted into strings and concatenated with ", ".

        Args:
            fileId (str): Synapse ID for dataset file.

        Returns:
            dict: Annotations as comma-separated strings.
        """

        # Get entity metadata, including annotations
        try:
            entity = self.synapse_entity_tracker.get(
                synapse_id=fileId, syn=self.syn, download_file=False
            )
            is_file = entity.concreteType.endswith(".FileEntity")
            is_folder = entity.concreteType.endswith(".Folder")
            annotations_raw = entity.annotations
        except SynapseHTTPError:
            # If an error occurs with retrieving the entity, skip it
            # This could be caused by a temporary file view that
            # was deleted since its ID was retrieved
            is_file, is_folder = False, False

        # Skip anything that isn't a file or folder
        if not (is_file or is_folder):
            return None

        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)

        return annotations

    def getEntityAnnotations(self, fileId, entity, annotations_raw):
        # Extract annotations from their lists and stringify. For example:
        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
        annotations = dict()
        for key, vals in annotations_raw.items():
            if isinstance(vals, list) and len(vals) == 1:
                annotations[key] = str(vals[0])
            else:
                annotations[key] = ", ".join(str(v) for v in vals)

        # Add the file entity ID and eTag, which weren't lists
        assert fileId == entity.id, (
            "For some reason, the Synapse ID in the response doesn't match "
            "the Synapse ID sent in the request (via synapseclient)."
        )
        annotations["entityId"] = fileId
        annotations["eTag"] = entity.etag

        return annotations

    def getDatasetAnnotations(
        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
    ) -> pd.DataFrame:
        """Generate a table of annotations across all files in a given dataset.

        Args:
            datasetId (str): Synapse ID for dataset folder.
            fill_na (bool): Whether to replace missing values with
                blank strings.
            force_batch (bool): Whether to force the function to use
                the batch mode, which uses a file view to retrieve
                annotations for a given dataset. Defaults to False
                unless there are 50 or more files in the dataset.

        Returns:
            pd.DataFrame: Table of annotations.
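
        Example (a minimal sketch; "syn123" is a hypothetical dataset ID):
            annotations_df = synapse_store.getDatasetAnnotations("syn123")
            # columns: Filename, <annotation keys...>, entityId, eTag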
        """
        # Get all files in the given dataset
        dataset_files = self.getFilesInStorageDataset(datasetId)

        # If there are no dataset files, there are no annotations,
        # so return an empty DataFrame
        if not dataset_files:
            return pd.DataFrame()

        dataset_files_map = dict(dataset_files)
        dataset_file_ids, _ = list(zip(*dataset_files))

        # Get annotations for each file from Step 1
        # Batch mode
        try_batch = len(dataset_files) >= 50 or force_batch
        if try_batch:
            try:
                logger.info("Trying batch mode for retrieving Synapse annotations")
                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
                logger.info(
                    f"Unable to create a temporary file view bound to {datasetId}. "
                    "Defaulting to slower iterative retrieval of annotations."
                )
                # Default to the slower non-batch method
                logger.info("Batch mode failed (probably due to permission error)")
                try_batch = False

        # Non-batch mode
        if not try_batch:
            logger.info("Using slower (non-batch) sequential mode")
            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
            # Remove any annotations for non-file/folders (stored as None)
            records = filter(None, records)
            table = pd.DataFrame.from_records(records)

        # Add filenames for the files that "survived" annotation retrieval
        filenames = [dataset_files_map[i] for i in table["entityId"]]

        if "Filename" not in table.columns:
            table.insert(0, "Filename", filenames)

        # Ensure that entityId and eTag are at the end
        entity_ids = table.pop("entityId")
        etags = table.pop("eTag")
        table.insert(len(table.columns), "entityId", entity_ids)
        table.insert(len(table.columns), "eTag", etags)

        # Missing values are filled in with empty strings for Google Sheets
        if fill_na:
            table.fillna("", inplace=True)

        # Force all values to strings
        return table.astype(str)

    def raise_final_error(retry_state):
        return retry_state.outcome.result()

    def checkIfinAssetView(self, syn_id) -> bool:
        # get data in the administrative file view for this pipeline
        assetViewTable = self.getStorageFileviewTable()
        all_files = list(assetViewTable["id"])
        return syn_id in all_files

    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
    @retry(
        stop=stop_after_attempt(5),
        wait=wait_chain(
            *[wait_fixed(10) for i in range(2)]
            + [wait_fixed(15) for i in range(2)]
            + [wait_fixed(20)]
        ),
        retry=retry_if_exception_type(LookupError),
        retry_error_callback=raise_final_error,
    )
    def getDatasetProject(self, datasetId: str) -> str:
        """Get the parent project for a given dataset ID.

        Args:
            datasetId (str): Synapse entity ID (folder or project).

        Raises:
            PermissionError: Raised if the given dataset Synapse ID isn't accessible to the user.
            LookupError: Raised if the dataset Synapse ID doesn't appear in the configured file view.

        Returns:
            str: The Synapse ID for the parent project.
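
        Example (illustrative; IDs are hypothetical):
            synapse_store.getDatasetProject("syn999")  # -> "syn111"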
        """

        # Subset main file view
        dataset_index = self.storageFileviewTable["id"] == datasetId
        dataset_row = self.storageFileviewTable[dataset_index]

        # re-query if no datasets found
        if dataset_row.empty:
            sleep(5)
            self.query_fileview(force_requery=True)
            # Subset main file view
            dataset_index = self.storageFileviewTable["id"] == datasetId
            dataset_row = self.storageFileviewTable[dataset_index]

        # Return `projectId` for the given row if only one found
        if len(dataset_row) == 1:
            dataset_project = dataset_row["projectId"].values[0]
            return dataset_project

        # Otherwise, check if the dataset is the project itself
        try:
            syn_object = self.synapse_entity_tracker.get(
                synapse_id=datasetId, syn=self.syn, download_file=False
            )
            if syn_object.properties["concreteType"].endswith("Project"):
                return datasetId
        except SynapseHTTPError:
            raise PermissionError(
                f"The given dataset ({datasetId}) isn't accessible with this "
                "user. This might be caused by a typo in the dataset Synapse ID."
            )

        # If not, then assume the dataset is not in the file view
        raise LookupError(
            f"The given dataset ({datasetId}) doesn't appear in the "
            f"configured file view ({self.storageFileview}). This might "
            "mean that the file view's scope needs to be updated."
        )

    def getDatasetAnnotationsBatch(
        self, datasetId: str, dataset_file_ids: Sequence[str] = None
    ) -> pd.DataFrame:
        """Generate a table of annotations across all files in a given dataset.
        This function uses a temporary file view to generate the table
        instead of iteratively querying for individual entity annotations.
        It is expected to run much faster than
        `self.getDatasetAnnotations` on large datasets.

        Args:
            datasetId (str): Synapse ID for dataset folder.
            dataset_file_ids (Sequence[str]): List of Synapse IDs
                for dataset files/folders used to subset the table.

        Returns:
            pd.DataFrame: Table of annotations.
        """
        # Create data frame from annotations file view
        with DatasetFileView(datasetId, self.syn) as fileview:
            table = fileview.query()

        if dataset_file_ids:
            table = table.loc[table.index.intersection(dataset_file_ids)]

        table = table.reset_index(drop=True)

        return table

    def _get_table_schema_by_cname(self, table_schema):
        # Assume no duplicate column names in the table
        table_schema_by_cname = {}

        for col_record in table_schema:
            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
            table_schema_by_cname[col_record["name"]] = col_record

        return table_schema_by_cname


class TableOperations:
    """
    Object to hold functions for various table operations specific to the Synapse Asset Store.
3001 
3002     Currently implemented operations are:
3003         createTable: upload a manifest as a new table when none exists
3004         replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
3005         updateTable: add a column to a table that already exists on synapse
3006 
3007     Operations currently in development are:
3008         upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
3009     """
3010 
3011     def __init__(
3012         self,
3013         synStore: SynapseStorage,
3014         tableToLoad: pd.DataFrame = None,
3015         tableName: str = None,
3016         datasetId: str = None,
3017         existingTableId: str = None,
3018         restrict: bool = False,
3019         synapse_entity_tracker: SynapseEntityTracker = None,
3020     ):
3021         """
3022         Class governing table operations (creation, replacement, upserts, updates) in schematic
3023 
3024         tableToLoad: manifest formatted appropriately for the table
3025         tableName: name of the table to be uploaded
3026         datasetId: synID of the dataset for the manifest
3027         existingTableId: synId of the table currently existing on synapse (if there is one)
3028         restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3029         synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3030 
3031         """
3032         self.synStore = synStore
3033         self.tableToLoad = tableToLoad
3034         self.tableName = tableName
3035         self.datasetId = datasetId
3036         self.existingTableId = existingTableId
3037         self.restrict = restrict
3038         self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3039 
3040     @tracer.start_as_current_span("TableOperations::createTable")
3041     def createTable(
3042         self,
3043         columnTypeDict: dict = None,
3044         specifySchema: bool = True,
3045     ):
3046         """
3047         Method to create a table from a metadata manifest and upload it to synapse
3048 
3049         Args:
3050             columnTypeDict: dictionary schema for table columns: type, size, etc.
3051             specifySchema: to specify a specific schema for the table format
3052 
3053         Returns:
3054             table.schema.id: synID of the newly created table
3055         """
3056         datasetEntity = self.synapse_entity_tracker.get(
3057             synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3058         )
3059         datasetName = datasetEntity.name
3060         table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3061 
3062         if not self.tableName:
3063             self.tableName = datasetName + "table"
3064         datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3065         if specifySchema:
3066             if columnTypeDict == {}:
3067                 logger.error("Did not provide a columnTypeDict.")
3068             # create list of columns:
3069             cols = []
3070             for col in self.tableToLoad.columns:
3071                 if col in table_schema_by_cname:
3072                     col_type = table_schema_by_cname[col]["columnType"]
3073                     max_size = (
3074                         table_schema_by_cname[col]["maximumSize"]
3075                         if "maximumSize" in table_schema_by_cname[col].keys()
3076                         else 100
3077                     )
3078                     max_list_len = 250
3079                     if max_size and max_list_len:
3080                         cols.append(
3081                             Column(
3082                                 name=col,
3083                                 columnType=col_type,
3084                                 maximumSize=max_size,
3085                                 maximumListLength=max_list_len,
3086                             )
3087                         )
3088                     elif max_size:
3089                         cols.append(
3090                             Column(name=col, columnType=col_type, maximumSize=max_size)
3091                         )
3092                     else:
3093                         cols.append(Column(name=col, columnType=col_type))
3094                 else:
3095                     # TODO add warning that the given col was not found and its max size is set to 100
3096                     cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3097             schema = Schema(
3098                 name=self.tableName, columns=cols, parent=datasetParentProject
3099             )
3100             table = Table(schema, self.tableToLoad)
3101             table = self.synStore.syn.store(table, isRestricted=self.restrict)
3102             # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3103             # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3104             self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3105             return table.schema.id
3106         else:
3107             # For just uploading the tables to synapse using default
3108             # column types.
3109             table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3110             table = self.synStore.syn.store(table, isRestricted=self.restrict)
3111             # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3112             # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3113             self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3114             return table.schema.id
3115 
3116     @tracer.start_as_current_span("TableOperations::replaceTable")
3117     def replaceTable(
3118         self,
3119         specifySchema: bool = True,
3120         columnTypeDict: dict = None,
3121     ):
3122         """
3123         Method to replace an existing table on synapse with metadata from a new manifest
3124 
3125         Args:
3126             specifySchema: to specify a specific schema for the table format
3127             columnTypeDict: dictionary schema for table columns: type, size, etc.
3128 
3129         Returns:
3130             existingTableId: synID of the already existing table that had its metadata replaced
3131         """
3132         datasetEntity = self.synapse_entity_tracker.get(
3133             synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3134         )
3135 
3136         datasetName = datasetEntity.name
3137         table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3138         existing_table, existing_results = self.synStore.get_synapse_table(
3139             self.existingTableId
3140         )
3141         # remove rows
3142         self.synStore.syn.delete(existing_results)
3143         # Data changes such as removing all rows cause the eTag to change.
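        # Dropping the cached entry below forces the pull-through cache to
        # re-fetch the table entity (and its new eTag) the next time it is needed.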
3144         self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3145         # wait for row deletion to finish on synapse before getting empty table
3146         sleep(10)
3147 
3148         # removes all current columns
3149         current_table = self.synapse_entity_tracker.get(
3150             synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3151         )
3152 
3153         current_columns = self.synStore.syn.getTableColumns(current_table)
3154 
3155         for col in current_columns:
3156             current_table.removeColumn(col)
3157 
3158         if not self.tableName:
3159             self.tableName = datasetName + "table"
3160 
3161         # Process columns according to manifest entries
3162         # (table_schema_by_cname was already built above)
3163         datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3164         if specifySchema:
3165             if columnTypeDict == {}:
3166                 logger.error("Did not provide a columnTypeDict.")
3167             # create list of columns:
3168             cols = []
3169 
3170             for col in self.tableToLoad.columns:
3171                 if col in table_schema_by_cname:
3172                     col_type = table_schema_by_cname[col]["columnType"]
3173                     max_size = (
3174                         table_schema_by_cname[col]["maximumSize"]
3175                         if "maximumSize" in table_schema_by_cname[col].keys()
3176                         else 100
3177                     )
3178                     max_list_len = 250
3179                     if max_size and max_list_len:
3180                         cols.append(
3181                             Column(
3182                                 name=col,
3183                                 columnType=col_type,
3184                                 maximumSize=max_size,
3185                                 maximumListLength=max_list_len,
3186                             )
3187                         )
3188                     elif max_size:
3189                         cols.append(
3190                             Column(name=col, columnType=col_type, maximumSize=max_size)
3191                         )
3192                     else:
3193                         cols.append(Column(name=col, columnType=col_type))
3194                 else:
3195                     # TODO add warning that the given col was not found and its max size is set to 100
3196                     cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3197 
3198             # adds new columns to schema
3199             for col in cols:
3200                 current_table.addColumn(col)
3201 
3202             table_result = self.synStore.syn.store(
3203                 current_table, isRestricted=self.restrict
3204             )
3205             # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3206             # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3207             self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3208 
3209             # wait for synapse store to finish
3210             sleep(1)
3211 
3212             # build schema and table from columns and store with necessary restrictions
3213             schema = Schema(
3214                 name=self.tableName, columns=cols, parent=datasetParentProject
3215             )
3216             schema.id = self.existingTableId
3217             table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3218             table = self.synStore.syn.store(table, isRestricted=self.restrict)
3219             # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3220             # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3221             self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3222         else:
3223             logger.error("Must specify a schema for table replacements")
3224 
3225         # remove system metadata from manifest
3226         existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3227         return self.existingTableId
3228 
3229     @tracer.start_as_current_span("TableOperations::_get_auth_token")
3230     def _get_auth_token(
3231         self,
3232     ):
3233         authtoken = None
3234 
3235         # Get access token from environment variable if available
3236         # Primarily useful for testing environments, with other possible usefulness for containers
3237         env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3238         if env_access_token:
3239             authtoken = env_access_token
3240             return authtoken
3241 
3242         # Get token from authorization header
3243         # Primarily useful for API endpoint functionality
3244         if "Authorization" in self.synStore.syn.default_headers:
3245             authtoken = self.synStore.syn.default_headers["Authorization"].split(
3246                 "Bearer "
3247             )[-1]
3248             return authtoken
3249 
3250         # retrieve credentials from synapse object
3251         # Primarily useful for local users; could only be stored here when a .synapseConfig file is used, but including to be safe
3252         synapse_object_creds = self.synStore.syn.credentials
3253         if hasattr(synapse_object_creds, "_token"):
3254             authtoken = synapse_object_creds.secret
3255 
3256         # Try getting creds from .synapseConfig file if it exists
3257         # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
3258         if os.path.exists(CONFIG.synapse_configuration_path):
3259             config = get_config_file(CONFIG.synapse_configuration_path)
3260 
3261             # check which credentials are provided in file
3262             if config.has_option("authentication", "authtoken"):
3263                 authtoken = config.get("authentication", "authtoken")
3264 
3265         # raise error if required credentials are not found
3266         if not authtoken:
3267             raise NameError(
3268                 "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3269             )
3270 
3271         return authtoken
3272 
3273     @tracer.start_as_current_span("TableOperations::upsertTable")
3274     def upsertTable(self, dmge: DataModelGraphExplorer):
3275         """
3276         Method to upsert rows from a new manifest into an existing table on synapse
3277         For upsert functionality to work, primary keys must follow the naming convention of <component>_id
3278         `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3279         Currently it is required to use -tcn "display label" with table upserts.
3280 
3281 
3282         Args:
3283             dmge: DataModelGraphExplorer instance
3284 
3285         Returns:
3286             existingTableId: synID of the already existing table that had its metadata replaced
3287         """
3288 
3289         authtoken = self._get_auth_token()
3290 
3291         synapseDB = SynapseDatabase(
3292             auth_token=authtoken,
3293             project_id=self.synStore.getDatasetProject(self.datasetId),
3294             syn=self.synStore.syn,
3295             synapse_entity_tracker=self.synapse_entity_tracker,
3296         )
3297 
3298         try:
3299             # Try performing upsert
3300             synapseDB.upsert_table_rows(
3301                 table_name=self.tableName, data=self.tableToLoad
3302             )
3303         except SynapseHTTPError as ex:
3304             # If the error is raised because the table has the old `Uuid` column and not the new `Id` column, then handle it and re-attempt the upload
3305             if "Id is not a valid column name or id" in str(ex):
3306                 self._update_table_uuid_column(dmge)
3307                 synapseDB.upsert_table_rows(
3308                     table_name=self.tableName, data=self.tableToLoad
3309                 )
3310             # Raise if other error
3311             else:
3312                 raise ex
3313 
3314         return self.existingTableId
3315 
3316     @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3317     def _update_table_uuid_column(
3318         self,
3319         dmge: DataModelGraphExplorer,
3320     ) -> None:
3321         """Removes the `Uuid` column when present, and replaces it with an `Id` column
3322         Used to enable backwards compatibility for manifests using the old `Uuid` convention
3323 
3324         Args:
3325             dmge: DataModelGraphExplorer instance
3326 
3327         Returns:
3328             None
3329         """
3330 
3331         # Get the columns of the schema
3332         schema = self.synapse_entity_tracker.get(
3333             synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3334         )
3335 
3336         cols = self.synStore.syn.getTableColumns(schema)
3337 
3338         # Iterate through columns until `Uuid` column is found
3339         for col in cols:
3340             if col.name.lower() == "uuid":
3341                 # See if schema has `Uuid` column specified
3342                 try:
3343                     uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3344                 except KeyError:
3345                     uuid_col_in_schema = False
3346 
3347                 # If there is, then create a new `Id` column from scratch
3348                 if uuid_col_in_schema:
3349                     new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3350                     schema.addColumn(new_col)
3351                     schema = self.synStore.syn.store(schema)
3352                     # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3353                     # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3354                     self.synapse_entity_tracker.remove(synapse_id=schema.id)
3355                 # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3356                 else:
3357                     # Build ColumnModel that will be used for new column
3358                     id_column = Column(
3359                         name="Id",
3360                         columnType="STRING",
3361                         maximumSize=64,
3362                         defaultValue=None,
3363                         maximumListLength=1,
3364                     )
3365                     new_col_response = self.synStore.syn.store(id_column)
3366 
3367                     # Define columnChange body
3368                     columnChangeDict = {
3369                         "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3370                         "entityId": self.existingTableId,
3371                         "changes": [
3372                             {
3373                                 "oldColumnId": col["id"],
3374                                 "newColumnId": new_col_response["id"],
3375                             }
3376                         ],
3377                     }
3378 
3379                     self.synStore.syn._async_table_update(
3380                         table=self.existingTableId,
3381                         changes=[columnChangeDict],
3382                         wait=False,
3383                     )
3384                 break
3385 
3386         return
3387 
3388     @tracer.start_as_current_span("TableOperations::updateTable")
3389     def updateTable(
3390         self,
3391         update_col: str = "Id",
3392     ):
3393         """
3394         Method to update an existing table with a new column
3395 
3396         Args:
3397             update_col: column to index the old and new tables on
3398 
3399         Returns:
3400             existingTableId: synID of the already existing table that had its metadata replaced
3401         """
3402         existing_table, existing_results = self.synStore.get_synapse_table(
3403             self.existingTableId
3404         )
3405 
3406         self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3407         # store table with existing etag data and impose restrictions as appropriate
3408         table_result = self.synStore.syn.store(
3409             Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3410             isRestricted=self.restrict,
3411         )
3412         # We cannot store the Table to the `synapse_entity_tracker` because there is
3413         # no `Schema` on the table object. The above `.store()` function call would
3414         # also update the eTag of the entity within Synapse. Remove it from the tracker
3415         # and re-retrieve it later on if needed again.
3416         self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3417 
3418         return self.existingTableId
3419 
3420 
3421 class DatasetFileView:
3422     """Helper class to create temporary dataset file views.
3423     This class can be used in conjunction with a 'with' statement.
3424     This will ensure that the file view is deleted automatically.
3425     See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3426     """
3427 
3428     def __init__(
3429         self,
3430         datasetId: str,
3431         synapse: Synapse,
3432         name: str = None,
3433         temporary: bool = True,
3434         parentId: str = None,
3435     ) -> None:
3436         """Create a file view scoped to a dataset folder.
3437 
3438         Args:
3439             datasetId (str): Synapse ID for a dataset folder/project.
3440             synapse (Synapse): Used for Synapse requests.
3441             name (str): Name of the file view (temporary or not).
3442             temporary (bool): Whether to delete the file view on exit
3443                 of either a 'with' statement or Python entirely.
3444             parentId (str, optional): Synapse ID specifying where to
3445                 store the file view. Defaults to datasetId.
3446         """
3447 
3448         self.datasetId = datasetId
3449         self.synapse = synapse
3450         self.is_temporary = temporary
3451 
3452         if name is None:
3453             self.name = f"schematic annotation file view for {self.datasetId}"
3454 
3455         if self.is_temporary:
3456             uid = secrets.token_urlsafe(5)
3457             self.name = f"{self.name} - UID {uid}"
3458 
3459         # TODO: Allow a DCC admin to configure a "universal parent"
3460         # Such as a Synapse project writeable by everyone.
3461         self.parentId = datasetId if parentId is None else parentId
3462 
3463         # TODO: Create local sharing setting to hide from everyone else
3464         view_schema = EntityViewSchema(
3465             name=self.name,
3466             parent=self.parentId,
3467             scopes=self.datasetId,
3468             includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3469             addDefaultViewColumns=False,
3470             addAnnotationColumns=True,
3471         )
3472 
3473         # TODO: Handle failure due to insufficient permissions by
3474         # creating a temporary new project to store view
3475         self.view_schema = self.synapse.store(view_schema)
3476 
3477         # These are filled in after calling `self.query()`
3478         self.results = None
3479         self.table = None
3480 
3481         # Ensure deletion of the file view (last resort)
3482         if self.is_temporary:
3483             atexit.register(self.delete)
3484 
3485     def __enter__(self):
3486         """Return file view when entering 'with' statement."""
3487         return self
3488 
3489     def __exit__(self, exc_type, exc_value, traceback):
3490         """Delete file view when exiting 'with' statement."""
3491         if self.is_temporary:
3492             self.delete()
3493 
3494     def delete(self):
3495         """Delete the file view on Synapse without deleting local table."""
3496         if self.view_schema is not None:
3497             self.synapse.delete(self.view_schema)
3498             self.view_schema = None
3499 
3500     def query(self, tidy=True, force=False):
3501         """Retrieve file view as a data frame (raw format sans index)."""
3502         if self.table is None or force:
3503             fileview_id = self.view_schema["id"]
3504             self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3505             self.table = self.results.asDataFrame(
3506                 rowIdAndVersionInIndex=False,
3507                 na_values=STR_NA_VALUES_FILTERED,
3508                 keep_default_na=False,
3509             )
3510         if tidy:
3511             self.tidy_table()
3512         return self.table
3513 
3514     def tidy_table(self):
3515         """Convert raw file view data frame into more usable format."""
3516         assert self.table is not None, "Must call `self.query()` first."
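        # The three passes below run in order: system columns are renamed or
        # dropped first, then list columns are flattened to comma-separated
        # strings, and finally integer columns are coerced back from floats.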
3517         self._fix_default_columns()
3518         self._fix_list_columns()
3519         self._fix_int_columns()
3520         return self.table
3521 
3522     def _fix_default_columns(self):
3523         """Rename default columns to match schematic expectations."""
3524 
3525         # Drop ROW_VERSION column if present
3526         if "ROW_VERSION" in self.table:
3527             del self.table["ROW_VERSION"]
3528 
3529         # Rename id column to entityId and set as data frame index
3530         if "ROW_ID" in self.table:
3531             self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3532             self.table = self.table.set_index("entityId", drop=False)
3533             del self.table["ROW_ID"]
3534 
3535         # Rename ROW_ETAG column to eTag and place at end of data frame
3536         if "ROW_ETAG" in self.table:
3537             row_etags = self.table.pop("ROW_ETAG")
3538 
3539             # The eTag column may already be present if users annotated data without submitting a manifest;
3540             # we're only concerned with the new values and not the existing ones
3541             if "eTag" in self.table:
3542                 del self.table["eTag"]
3543 
3544             self.table.insert(len(self.table.columns), "eTag", row_etags)
3545 
3546         return self.table
3547 
3548     def _get_columns_of_type(self, types):
3549         """Helper function to get list of columns of a given type(s)."""
3550         matching_columns = []
3551         for header in self.results.headers:
3552             if header.columnType in types:
3553                 matching_columns.append(header.name)
3554         return matching_columns
3555 
3556     def _fix_list_columns(self):
3557         """Fix formatting of list-columns."""
3558         list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3559         list_columns = self._get_columns_of_type(list_types)
3560         for col in list_columns:
3561             self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
3562         return self.table
3563 
3564     def _fix_int_columns(self):
3565         """Ensure that integer-columns are actually integers."""
3566         int_columns = self._get_columns_of_type({"INTEGER"})
3567         for col in int_columns:
3568             # Coercing to string because NaN is a floating point value
3569             # and cannot exist alongside integers in a column
3570             def to_int_fn(x):
3571                 return "" if np.isnan(x) else str(int(x))
3572 
3573             self.table[col] = self.table[col].apply(to_int_fn)
3574         return self.table
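A minimal usage sketch of DatasetFileView as a context manager, mirroring how SynapseStorage.getDatasetAnnotationsBatch drives it. The dataset ID below is a placeholder, and credentials are assumed to be configured already (e.g. in ~/.synapseConfig or via SYNAPSE_ACCESS_TOKEN):

import synapseclient

syn = synapseclient.Synapse()
syn.login(silent=True)

# The temporary view is created on entry and deleted on exit of the block.
with DatasetFileView("syn00000000", syn) as fileview:
    annotations = fileview.query(tidy=True)  # tidied pandas DataFrame

# eTag is placed at the end of the frame by _fix_default_columns()
print(annotations.columns.tolist())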
181         filename = manifest_data._file_handle.fileName
182         if filename != os.path.basename(manifest_data.path):
183             parent_folder = os.path.dirname(manifest_data.path)
184             manifest_original_name_and_path = os.path.join(parent_folder, filename)
185 
186             self.syn.cache.remove(
187                 file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
188             )
189             os.rename(manifest_data.path, manifest_original_name_and_path)
190             manifest_data.path = manifest_original_name_and_path
191             self.syn.cache.add(
192                 file_handle_id=manifest_data.dataFileHandleId,
193                 path=manifest_original_name_and_path,
194                 md5=manifest_data._file_handle.contentMd5,
195             )
196 
197         return manifest_data
198 
199     def _entity_type_checking(self) -> None:
200         """
201         check the entity type of the ID that needs to be downloaded
202         Return:
203             None; if the entity type is not a file, an error is logged
204         """
205         # check the type of entity
206         entity_type = entity_type_mapping(
207             syn=self.syn,
208             entity_id=self.manifest_id,
209             synapse_entity_tracker=self.synapse_entity_tracker,
210         )
211         if entity_type != "file":
212             logger.error(
213                 f"You are using entity type: {entity_type}. Please provide a file ID"
214             )
215 
216     def download_manifest(
217         self,
218         newManifestName: str = "",
219         manifest_df: pd.DataFrame = pd.DataFrame(),
220         use_temporary_folder: bool = True,
221     ) -> Union[str, File]:
222         """
223         Download a manifest based on a given manifest ID.
224         Args:
225             newManifestName(optional): new name of a manifest that gets downloaded.
226             manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
            use_temporary_folder(optional): whether the manifest should be downloaded to a temporary folder; see _download_manifest_to_folder. Defaults to True.
227         Return:
228             manifest_data: synapse entity file object
229         """
230 
231         # enables retrying if user does not have access to uncensored manifest
232         # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
233         manifest_data = ""
234 
235         # check entity type
236         self._entity_type_checking()
237 
238         # download a manifest
239         try:
240             manifest_data = self._download_manifest_to_folder(
241                 use_temporary_folder=use_temporary_folder
242             )
243         except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
244             # if there's an error getting an uncensored manifest, try getting the censored manifest
245             if not manifest_df.empty:
246                 censored_regex = re.compile(".*censored.*")
247                 censored = manifest_df["name"].str.contains(censored_regex)
248                 new_manifest_id = manifest_df[censored]["id"].iloc[0]
249                 self.manifest_id = new_manifest_id
250                 try:
251                     manifest_data = self._download_manifest_to_folder(
252                         use_temporary_folder=use_temporary_folder
253                     )
254                 except (
255                     SynapseUnmetAccessRestrictions,
256                     SynapseAuthenticationError,
257                 ) as e:
258                     raise PermissionError(
259                         "You don't have access to censored and uncensored manifests in this dataset."
260                     ) from e
261             else:
262                 logger.error(
263                     f"You don't have access to the requested resource: {self.manifest_id}"
264                 )
265 
266         if newManifestName and os.path.exists(manifest_data.get("path")):
267             # Rename the file we just made to the new name
268             new_manifest_filename = newManifestName + ".csv"
269 
270             # get location of existing manifest. The manifest that will be renamed should live in the same folder as the existing manifest.
271             parent_folder = os.path.dirname(manifest_data.get("path"))
272 
273             new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
274 
275             # Copy the file to the new location. The purpose of using a copy instead of a rename
276             # is to avoid any potential issues with the file being used in another
277             # process. This avoids any potential race or concurrency conditions.
278             shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
279 
280             # Adding this to the cache will allow us to re-use the already downloaded
281             # manifest file for up to 1 hour.
282             self.syn.cache.add(
283                 file_handle_id=manifest_data.dataFileHandleId,
284                 path=new_manifest_path_name,
285                 md5=manifest_data._file_handle.contentMd5,
286             )
287 
288             # Update file names/paths in manifest_data
289             manifest_data["name"] = new_manifest_filename
290             manifest_data["filename"] = new_manifest_filename
291             manifest_data["path"] = new_manifest_path_name
292 
293         return manifest_data
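A minimal usage sketch of ManifestDownload (the manifest ID below is a placeholder; `syn` is assumed to be a logged-in synapseclient.Synapse instance, e.g. the one returned by SynapseStorage.login):

# Download a manifest file by its Synapse ID, using a temporary folder so
# that concurrent API requests do not overwrite each other's copies.
md = ManifestDownload(syn=syn, manifest_id="syn00000000")
manifest_file = md.download_manifest(use_temporary_folder=True)
print(manifest_file.path)  # local path of the downloaded manifest CSV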
296 class SynapseStorage(BaseStorage):
297     """Implementation of Storage interface for datasets/files stored on Synapse.
298     Provides utilities to list files in a specific project, update file annotations, create file views, etc.
299 
300     TODO: Need to define the interface and rename and/or refactor some of the methods below.
301     """
302 
303     @tracer.start_as_current_span("SynapseStorage::__init__")
304     def __init__(
305         self,
306         token: Optional[str] = None,  # optional parameter retrieved from browser cookie
307         access_token: Optional[str] = None,
308         project_scope: Optional[list] = None,
309         synapse_cache_path: Optional[str] = None,
310         perform_query: Optional[bool] = True,
311         columns: Optional[list] = None,
312         where_clauses: Optional[list] = None,
313     ) -> None:
314         """Initializes a SynapseStorage object.
315 
316         Args:
317             token (Optional[str], optional):
318                 Optional token parameter as found in browser cookie upon login to synapse.
319                 Defaults to None.
320             access_token (Optional[str], optional):
321                 Optional access token (personal or OAuth).
322                 Defaults to None.
323             project_scope (Optional[list], optional): Defaults to None.
324             synapse_cache_path (Optional[str], optional):
325                 Location of synapse cache.
326                 Defaults to None.
327         TODO:
328             Consider the necessity of adding "columns" and "where_clauses" params to the constructor. Currently, with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
329         """
330         self.syn = self.login(synapse_cache_path, access_token)
331         self.project_scope = project_scope
332         self.storageFileview = CONFIG.synapse_master_fileview_id
333         self.manifest = CONFIG.synapse_manifest_basename
334         self.root_synapse_cache = self.syn.cache.cache_root_dir
335         self.synapse_entity_tracker = SynapseEntityTracker()
336         if perform_query:
337             self.query_fileview(columns=columns, where_clauses=where_clauses)
338 
339     # TODO: When moving this over to a regular cron-job the following logic should be
340     # moved out of `manifest_download`:
341     # if "SECRETS_MANAGER_SECRETS" in os.environ:
342     #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
343     #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
344     @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
345     def _purge_synapse_cache(
346         self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
347     ) -> None:
348         """
349         Purge synapse cache if it exceeds a certain size. Defaults to 1 GB.
350         Args:
351             maximum_storage_allowed_cache_gb (int): the maximum storage allowed
352                 before purging the cache. Defaults to 1 GB.
353             minute_buffer (int): all files older than this many minutes will be deleted
354         """
355         # try clearing the cache
356         # scan a directory and check size of files
357         if os.path.exists(self.root_synapse_cache):
358             maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
359                 1024**3
360             )
361             nbytes = get_dir_size(self.root_synapse_cache)
362             dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
363             # if the maximum storage has already been used up, purge cache entries older than the minute buffer
364             if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
365                 num_of_deleted_files = clear_synapse_cache(
366                     self.syn.cache, minutes=minute_buffer
367                 )
368                 logger.info(
369                     f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
370                 )
371             else:
372                 # on AWS, the OS takes around 14-17% of our ephemeral storage (20GiB);
373                 # instead of guessing how much space we have left, log the size of .synapseCache here
374                 logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
375 
376     @tracer.start_as_current_span("SynapseStorage::query_fileview")
377     def query_fileview(
378         self,
379         columns: Optional[list] = None,
380         where_clauses: Optional[list] = None,
381         force_requery: Optional[bool] = False,
382     ) -> None:
383         """
384         Method to query the Synapse FileView; the results are stored in the storageFileviewTable attribute as a pandas DataFrame.
385         Is called once during initialization of the SynapseStorage object and can be called again later to specify a more limited scope for validation purposes.
386         Args:
387             columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
388             where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
389             force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
390         """
391         self._purge_synapse_cache()
392 
393         # Initialize to assume that the new fileview query will be different from what may already be stored. Initialized to True because generally a previous query will not have been performed.
394         self.new_query_different = True
395 
396         # If a query has already been performed, store the query
397         previous_query_built = hasattr(self, "fileview_query")
398         if previous_query_built:
399             previous_query = self.fileview_query
400 
401         # Build a query with the current given parameters and check to see if it is different from the previous
402         self._build_query(columns=columns, where_clauses=where_clauses)
403         if previous_query_built:
404             self.new_query_different = self.fileview_query != previous_query
405 
406         # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
407         if self.new_query_different or force_requery:
408             try:
409                 self.storageFileviewTable = self.syn.tableQuery(
410                     query=self.fileview_query,
411                 ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
412             except SynapseHTTPError as exc:
413                 exception_text = str(exc)
414                 if "Unknown column path" in exception_text:
415                     raise ValueError(
416                         "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
417                     )
418                 elif "Unknown column" in exception_text:
419                     missing_column = exception_text.split("Unknown column ")[-1]
420                     raise ValueError(
421                         f"The column {missing_column} specified in the query does not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
422                     )
423                 else:
424                     raise AccessCredentialsError(self.storageFileview)
425 
426     @staticmethod
427     def build_clause_from_dataset_id(
428         dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
429     ) -> str:
430         """
431         Method to build a where clause for a Synapse FileView query based on a dataset ID; it can be used before an object is initialized.
432         Args:
433             dataset_id: Synapse ID of a dataset that should be used to limit the query
434             dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
435         Returns:
436             clause for the query or an empty string if no dataset ID is provided
437         """
438         # Calling this method without specifying synIDs will complete but will not scope the view
439         if (not dataset_id) and (not dataset_folder_list):
440             return ""
441 
442         # This will be used to gather files under a dataset recursively with a fileview query instead of walking
443         if dataset_folder_list:
444             search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
445             return f"parentId IN ({search_folders})"
446 
447         # `dataset_id` should be provided when all files are stored directly under the dataset folder
448         return f"parentId='{dataset_id}'"
449 
450     def _build_query(
451         self, columns: Optional[list] = None, where_clauses: Optional[list] = None
452     ):
453         """
454         Method to build a query for Synapse FileViews
455         Args:
456             columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
457             where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
458             self.storageFileview (str): Synapse FileView ID
459             self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
460                 Gets added to where_clauses; included mainly for backwards compatibility and as a more user-friendly way of subsetting the view.
461         """
462         if columns is None:
463             columns = []
464         if where_clauses is None:
465             where_clauses = []
466 
467         if self.project_scope:
468             # appending '' ensures a single-project scope still renders as a valid SQL tuple, e.g. ('syn1', '')
            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
469             where_clauses.append(project_scope_clause)
470 
471         if where_clauses:
472             where_clauses = " AND ".join(where_clauses)
473             where_clauses = f"WHERE {where_clauses} ;"
474         else:
475             where_clauses = ";"
476 
477         if columns:
478             columns = ",".join(columns)
479         else:
480             columns = "*"
481 
482         self.fileview_query = (
483             f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
484         )
485 
486         return
487 
488     @staticmethod
489     @tracer.start_as_current_span("SynapseStorage::login")
490     def login(
491         synapse_cache_path: Optional[str] = None,
492         access_token: Optional[str] = None,
493     ) -> synapseclient.Synapse:
494         """Login to Synapse
495 
496         Args:
497             access_token (Optional[str], optional): A synapse access token. Defaults to None.
498             synapse_cache_path (Optional[str]): location of synapse cache
499 
500         Raises:
501             ValueError: If unable to log in with the access token
502 
503         Returns:
504             synapseclient.Synapse: A Synapse object that is logged in
505         """
506         if not access_token:
507             access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
508 
509         # login using a token
510         if access_token:
511             try:
512                 syn = synapseclient.Synapse(
513                     cache_root_dir=synapse_cache_path,
514                     debug=False,
515                     skip_checks=True,
516                     cache_client=False,
517                 )
518                 syn.login(authToken=access_token, silent=True)
519             except SynapseHTTPError as exc:
520                 raise ValueError(
521                     "No access to resources. Please make sure that your token is correct"
522                 ) from exc
523         else:
524             # login using synapse credentials provided by user in .synapseConfig (default) file
525             syn = synapseclient.Synapse(
526                 configPath=CONFIG.synapse_configuration_path,
527                 cache_root_dir=synapse_cache_path,
528                 debug=False,
529                 skip_checks=True,
530                 cache_client=False,
531             )
532             syn.login(silent=True)
533 
534         # set user id attribute
535         current_span = trace.get_current_span()
536         if current_span.is_recording():
537             current_span.set_attribute("user.id", syn.credentials.owner_id)
538 
539         return syn
540 
541     def missing_entity_handler(method):
542         def wrapper(*args, **kwargs):
543             try:
544                 return method(*args, **kwargs)
545             except SynapseHTTPError as ex:
546                 str_message = str(ex).replace("\n", "")
547                 if "trash" in str_message or "does not exist" in str_message:
548                     logger.warning(str_message)
549                     return None
550                 else:
551                     raise ex
552 
553         return wrapper
554 
555     def async_missing_entity_handler(method):
556         """Decorator to handle missing entities in async methods."""
557 
558         async def wrapper(*args: Any, **kwargs: Any) -> Any:
559             try:
560                 return await method(*args, **kwargs)
561             except SynapseHTTPError as ex:
562                 str_message = str(ex).replace("\n", "")
563                 if "trash" in str_message or "does not exist" in str_message:
564                     logger.warning(str_message)
565                     return None
566                 else:
567                     raise ex
568 
569         return wrapper
570 
571     def getStorageFileviewTable(self):
572         """Returns the storageFileviewTable obtained during initialization."""
573         return self.storageFileviewTable
574 
575     def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
576         """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
577 
578         Args:
579             currentUserId: synapse id for the user whose projects we want to get.
580 
581         Returns:
582             A dictionary with a next page token and the results.
583         """
584         all_results = self.syn.restGET(
585             "/projects/user/{principalId}".format(principalId=currentUserId)
586         )
587 
588         while (
589             "nextPageToken" in all_results
590         ):  # iterate over next page token in results while there is any
591             results_token = self.syn.restGET(
592                 "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
593                     principalId=currentUserId,
594                     nextPageToken=all_results["nextPageToken"],
595                 )
596             )
597             all_results["results"].extend(results_token["results"])
598 
599             if "nextPageToken" in results_token:
600                 all_results["nextPageToken"] = results_token["nextPageToken"]
601             else:
602                 del all_results["nextPageToken"]
603 
604         return all_results
605 
606     @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
607     def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
608         """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
609 
610         Returns:
611             A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
612         """
613 
614         # get the set of all storage Synapse projects accessible for this pipeline
615         storageProjects = self.storageFileviewTable["projectId"].unique()
616 
617         # get the set of storage Synapse projects accessible for this user
618         # get a list of projects from Synapse
619         current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
620             current_user_id=self.syn.credentials.owner_id, syn=self.syn
621         )
622         project_id_to_name_dict = {}
623         current_user_projects = []
624         for project_header in current_user_project_headers:
625             project_id_to_name_dict[project_header.get("id")] = project_header.get(
626                 "name"
627             )
628             current_user_projects.append(project_header.get("id"))
629 
630         # find set of user projects that are also in this pipeline's storage projects set
631         storageProjects = list(set(storageProjects) & set(current_user_projects))
632 
633         # Limit projects to scope if specified
634         if project_scope:
635             storageProjects = list(set(storageProjects) & set(project_scope))
636 
637         if not storageProjects:
638             raise Warning(
639                 f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
640             )
641 
642         # prepare a return list of project IDs and names
643         projects = []
644         for projectId in storageProjects:
645             project_name_from_project_header = project_id_to_name_dict.get(projectId)
646             projects.append((projectId, project_name_from_project_header))
647 
648         sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
649 
650         return sorted_projects_list
651 
652     @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
653     def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
654         """Gets all datasets in folder under a given storage project that the current user has access to.
655 
656         Args:
657             projectId: synapse ID of a storage project.
658 
659         Returns:
660             A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
661             None: If the projectId cannot be found on Synapse.
662 """ 663 664 # select all folders and fetch their names from within the storage project; 665 # if folder content type is defined, only select folders that contain datasets 666 if "contentType" in self.storageFileviewTable.columns: 667 foldersTable = self.storageFileviewTable[ 668 (self.storageFileviewTable["contentType"] == "dataset") 669 & (self.storageFileviewTable["projectId"] == projectId) 670 ] 671 else: 672 foldersTable = self.storageFileviewTable[ 673 (self.storageFileviewTable["type"] == "folder") 674 & (self.storageFileviewTable["parentId"] == projectId) 675 ] 676 677 # get an array of tuples (folderId, folderName) 678 # some folders are part of datasets; others contain datasets 679 # each dataset parent is the project; folders part of a dataset have another folder as a parent 680 # to get folders if and only if they contain datasets for each folder 681 # check if folder's parent is the project; if so that folder contains a dataset, 682 # unless the folder list has already been filtered to dataset folders based on contentType attribute above 683 684 datasetList = [] 685 folderProperties = ["id", "name"] 686 for folder in list( 687 foldersTable[folderProperties].itertuples(index=False, name=None) 688 ): 689 datasetList.append(folder) 690 691 sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) 692 693 return sorted_dataset_list 694 695 @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") 696 def getFilesInStorageDataset( 697 self, datasetId: str, fileNames: List = None, fullpath: bool = True 698 ) -> List[Tuple[str, str]]: 699 """Gets all files (excluding manifest files) in a given dataset folder. 700 701 Args: 702 datasetId: synapse ID of a storage dataset. 703 fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. 704 metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 705 fullpath: if True return the full path as part of this filename; otherwise return just base filename 706 707 Returns: 708 A list of files; the list consists of tuples (fileId, fileName). 709 710 Raises: 711 ValueError: Dataset ID not found. 712 """ 713 file_list = [] 714 715 # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view 716 if self.storageFileviewTable.empty: 717 raise ValueError( 718 f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." 719 ) 720 child_path = self.storageFileviewTable.loc[ 721 self.storageFileviewTable["parentId"] == datasetId, "path" 722 ] 723 if child_path.empty: 724 raise LookupError( 725 f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 
726             )
727         child_path = child_path.iloc[0]
728 
729         # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
730         parent = child_path.split("/")[:-1]
731         parent = "/".join(parent)
732 
733         # When querying, only include files, to exclude entity files and subdirectories
734         where_clauses = [create_like_statement(parent), "type='file'"]
735 
736         # Requery the fileview to specifically get the files in the given dataset
737         self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
738 
739         # Exclude manifest files
740         non_manifest_files = self.storageFileviewTable.loc[
741             ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
742             :,
743         ]
744 
745         # Remove all files that are not in the list of fileNames
746         if fileNames:
747             filename_regex = "|".join(fileNames)
748 
749             matching_files = non_manifest_files["path"].str.contains(
750                 filename_regex, case=False, regex=True
751             )
752 
753             non_manifest_files = non_manifest_files.loc[matching_files, :]
754 
755         # Truncate path if necessary
756         if not fullpath:
757             non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
758 
759         # Return list of files as expected by other methods
760         file_list = list(non_manifest_files.itertuples(index=False, name=None))
761 
762         return file_list
763 
764     def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
765         """If both censored and uncensored manifests are present, return the uncensored manifest; if only one manifest is present, return the manifest ID of that manifest; if more than two manifests are present, return the manifest ID of the first one.
766         Args:
767             manifest: a dataframe containing the name and id of manifests in a given asset view
768 
769         Return:
770             manifest_syn_id: id of a given censored or uncensored manifest
771         """
772         censored_regex = re.compile(".*censored.*")
773         censored = manifest["name"].str.contains(censored_regex)
774         if any(censored):
775             # Try to use uncensored manifest first
776             not_censored = ~censored
777             if any(not_censored):
778                 manifest_syn_id = manifest[not_censored]["id"].iloc[0]
779             # if only censored manifests are available, just use the first censored manifest
780             else:
781                 manifest_syn_id = manifest["id"].iloc[0]
782 
783         # otherwise, use the first (implied only) version that exists
784         else:
785             manifest_syn_id = manifest["id"].iloc[0]
786 
787         return manifest_syn_id
788 
789     @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
790     def getDatasetManifest(
791         self,
792         datasetId: str,
793         downloadFile: bool = False,
794         newManifestName: str = "",
795         use_temporary_folder: bool = True,
796     ) -> Union[str, File]:
797         """Gets the manifest associated with a given dataset.
798 
799         Args:
800             datasetId: synapse ID of a storage dataset.
801             downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
802             newManifestName: new name of a manifest that gets downloaded
803             use_temporary_folder: boolean argument indicating if a temporary folder
804                 should be used to store the manifest file. This is useful when running
805                 this code as an API server where multiple requests could be made at the
806                 same time. This is set to False when the code is being used from the
807                 CLI. Defaults to True.
808 
809         Returns:
810             manifest_syn_id (String): Synapse ID of existing manifest file.
811             manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
812             "" (String): No pre-existing manifest in dataset.
813 """ 814 manifest_data = "" 815 816 # get a list of files containing the manifest for this dataset (if any) 817 all_files = self.storageFileviewTable 818 819 # construct regex based on manifest basename in the config 820 manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") 821 822 # search manifest based on given manifest basename regex above 823 # and return a dataframe containing name and id of manifests in a given asset view 824 manifest = all_files[ 825 (all_files["name"].str.contains(manifest_re, regex=True)) 826 & (all_files["parentId"] == datasetId) 827 ] 828 829 manifest = manifest[["id", "name"]] 830 831 # if there is no pre-exisiting manifest in the specified dataset 832 if manifest.empty: 833 logger.warning( 834 f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" 835 ) 836 return "" 837 838 # if there is an exisiting manifest 839 else: 840 manifest_syn_id = self._get_manifest_id(manifest) 841 if downloadFile: 842 md = ManifestDownload( 843 self.syn, 844 manifest_id=manifest_syn_id, 845 synapse_entity_tracker=self.synapse_entity_tracker, 846 ) 847 manifest_data = md.download_manifest( 848 newManifestName=newManifestName, 849 manifest_df=manifest, 850 use_temporary_folder=use_temporary_folder, 851 ) 852 # TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, 853 # then we should catch the error here without returning an empty string. 854 if not manifest_data: 855 logger.debug( 856 f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" 857 ) 858 return manifest_data 859 return manifest_syn_id 860 861 def getDataTypeFromManifest(self, manifestId: str): 862 """Fetch a manifest and return data types of all columns 863 Args: 864 manifestId: synapse ID of a manifest 865 """ 866 # get manifest file path 867 manifest_entity = self.synapse_entity_tracker.get( 868 synapse_id=manifestId, syn=self.syn, download_file=True 869 ) 870 manifest_filepath = manifest_entity.path 871 872 # load manifest dataframe 873 manifest = load_df( 874 manifest_filepath, 875 preserve_raw_input=False, 876 data_model=False, 877 ) 878 879 # convert the dataFrame to use best possible dtypes. 880 manifest_new = manifest.convert_dtypes() 881 882 # get data types of columns 883 result = manifest_new.dtypes.to_frame("dtypes").reset_index() 884 885 # return the result as a dictionary 886 result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() 887 888 return result_dict 889 890 def _get_files_metadata_from_dataset( 891 self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None 892 ) -> Optional[dict]: 893 """retrieve file ids under a particular datasetId 894 895 Args: 896 datasetId (str): a dataset id 897 only_new_files (bool): if only adding new files that are not already exist 898 manifest (pd.DataFrame): metadata manifest dataframe. Default to None. 
899 
900         Returns:
901             a dictionary that contains filenames and entity IDs under a given datasetId, or None if there are no files under the given dataset ID
902         """
903         dataset_files = self.getFilesInStorageDataset(datasetId)
904         if dataset_files:
905             dataset_file_names_id_dict = self._get_file_entityIds(
906                 dataset_files, only_new_files=only_new_files, manifest=manifest
907             )
908             return dataset_file_names_id_dict
909         else:
910             return None
911 
912     def add_entity_id_and_filename(
913         self, datasetId: str, manifest: pd.DataFrame
914     ) -> pd.DataFrame:
915         """add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present
916 
917         Args:
918             datasetId (str): dataset syn id
919             manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and the Filename column is present but completely empty
920 
921         Returns:
922             pd.DataFrame: returns a pandas dataframe
923         """
924         # get file names and entity ids of a given dataset
925         dataset_files_dict = self._get_files_metadata_from_dataset(
926             datasetId, only_new_files=False
927         )
928 
929         if dataset_files_dict:
930             # turn manifest dataframe back to a dictionary for operation
931             manifest_dict = manifest.to_dict("list")
932 
933             # update Filename column
934             # add entityId column to the end
935             manifest_dict.update(dataset_files_dict)
936 
937             # if the component column exists in existing manifest, fill up that column
938             if "Component" in manifest_dict.keys():
939                 manifest_dict["Component"] = manifest_dict["Component"] * max(
940                     1, len(manifest_dict["Filename"])
941                 )
942 
943             # turn dictionary back to a dataframe
944             manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
945             manifest_df_updated = manifest_df_index.transpose()
946 
947             # fill na with empty string
948             manifest_df_updated = manifest_df_updated.fillna("")
949 
950             # drop index
951             manifest_df_updated = manifest_df_updated.reset_index(drop=True)
952 
953             return manifest_df_updated
954         else:
955             return manifest
956 
957     def fill_in_entity_id_filename(
958         self, datasetId: str, manifest: pd.DataFrame
959     ) -> Tuple[List, pd.DataFrame]:
960         """fill in the Filename and entityId columns. The entityId and Filename columns will be created if not already present.
961 
962         Args:
963             datasetId (str): dataset syn id
964             manifest (pd.DataFrame): existing manifest dataframe.
957    def fill_in_entity_id_filename(
958        self, datasetId: str, manifest: pd.DataFrame
959    ) -> Tuple[List, pd.DataFrame]:
960        """Fill in the Filename and entityId columns. Both columns will be created if not already present.
961
962        Args:
963            datasetId (str): dataset syn id
964            manifest (pd.DataFrame): existing manifest dataframe.
965
966        Returns:
967            Tuple[List, pd.DataFrame]: a list of synIds that are under the given datasetId folder, and the updated manifest dataframe
968        """
969        # get dataset file names and entity ids as a list of tuples
970        dataset_files = self.getFilesInStorageDataset(datasetId)
971
972        # update manifest with additional filenames, if any
973        # note that if there is an existing manifest and there are files in the dataset
974        # the columns Filename and entityId are assumed to be present in the manifest schema
975        # TODO: use idiomatic pandas syntax
976        if not dataset_files:
977            manifest = manifest.fillna("")
978            return dataset_files, manifest
979
980        all_files = self._get_file_entityIds(
981            dataset_files=dataset_files, only_new_files=False, manifest=manifest
982        )
983        new_files = self._get_file_entityIds(
984            dataset_files=dataset_files, only_new_files=True, manifest=manifest
985        )
986
987        all_files = pd.DataFrame(all_files)
988        new_files = pd.DataFrame(new_files)
989
990        # update manifest so that it contains the new dataset files
991        manifest = (
992            pd.concat([manifest, new_files], sort=False)
993            .reset_index()
994            .drop("index", axis=1)
995        )
996
997        # Reindex the manifest and new-files dataframes according to entityIds to align file paths and metadata
998        manifest_reindex = manifest.set_index("entityId")
999        all_files_reindex = all_files.set_index("entityId")
1000        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1001            manifest_reindex
1002        )
1003
1004        # Check if individual file paths in the manifest and from Synapse match
1005        file_paths_match = (
1006            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1007        )
1008
1009        # If any of the paths do not match, update the manifest with the file paths from Synapse
1010        if not file_paths_match.all():
1011            manifest_reindex.loc[
1012                ~file_paths_match, "Filename"
1013            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1014
1015        # reformat the manifest for further use
1016        manifest = manifest_reindex.reset_index()
1017        entityIdCol = manifest.pop("entityId")
1018        manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1019
1020        manifest = manifest.fillna("")
1021        return dataset_files, manifest
1022
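    # -----------------------------------------------------------------------
    # Illustrative sketch (not part of this class): the reindex_like pattern
    # used by fill_in_entity_id_filename to reconcile manifest file paths with
    # the paths currently in Synapse. The toy frames are assumptions for the
    # example.
    # -----------------------------------------------------------------------
    # import pandas as pd
    #
    # manifest = pd.DataFrame({"Filename": ["old/a.txt", "b.txt"], "entityId": ["syn1", "syn2"]}).set_index("entityId")
    # synapse_files = pd.DataFrame({"Filename": ["new/a.txt", "b.txt"], "entityId": ["syn1", "syn2"]}).set_index("entityId")
    # aligned = synapse_files.reindex_like(manifest)  # row order now matches the manifest
    # mismatch = manifest["Filename"] != aligned["Filename"]
    # manifest.loc[mismatch, "Filename"] = aligned.loc[mismatch, "Filename"]
    # print(manifest)  # syn1 now carries the Synapse path "new/a.txt"
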
1023    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1024    def updateDatasetManifestFiles(
1025        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1026    ) -> Union[Tuple[str, pd.DataFrame], None]:
1027        """Fetch the names and entity IDs of all current files in the dataset in the store, if any; update the dataset's manifest with new files, if any.
1028
1029        Args:
1030            dmge: DataModelGraphExplorer instance
1031            datasetId: synapse ID of a storage dataset.
1032            store: if True, store the updated manifest in the asset store; if False,
1033                return a pandas dataframe containing the updated manifest but do not store it to the asset store
1034
1035
1036        Returns:
1037            Synapse ID of the updated manifest and a pandas dataframe containing the updated manifest.
1038            If there is no existing manifest, or if the manifest does not have an entityId column, return None
1039        """
1040
1041        # get existing manifest Synapse ID
1042        manifest_id = self.getDatasetManifest(datasetId)
1043
1044        # if there is no manifest return None
1045        if not manifest_id:
1046            return None
1047
1048        manifest_entity = self.synapse_entity_tracker.get(
1049            synapse_id=manifest_id, syn=self.syn, download_file=True
1050        )
1051        manifest_filepath = manifest_entity.path
1052        manifest = load_df(manifest_filepath)
1053
1054        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1055        if "entityId" not in manifest.columns:
1056            return None
1057
1058        manifest_is_file_based = "Filename" in manifest.columns
1059
1060        if manifest_is_file_based:
1061            # update manifest with additional filenames, if any
1062            # note that if there is an existing manifest and there are files in the dataset
1063            # the columns Filename and entityId are assumed to be present in the manifest schema
1064            # TODO: use idiomatic pandas syntax
1065            dataset_files, manifest = self.fill_in_entity_id_filename(
1066                datasetId, manifest
1067            )
1068            if dataset_files:
1069                # update the manifest file, so that it contains the relevant entity IDs
1070                if store:
1071                    manifest.to_csv(manifest_filepath, index=False)
1072
1073                    # store the manifest and update the associated metadata with the manifest on Synapse
1074                    manifest_id = self.associateMetadataWithFiles(
1075                        dmge, manifest_filepath, datasetId
1076                    )
1077
1078        return manifest_id, manifest
1079
1080    def _get_file_entityIds(
1081        self,
1082        dataset_files: List,
1083        only_new_files: bool = False,
1084        manifest: pd.DataFrame = None,
1085    ):
1086        """
1087        Get a dictionary of files in a dataset: either only the files that are not yet in the current manifest, or all files
1088
1089        Args:
1090            dataset_files: List of all files in a dataset
1091            only_new_files: boolean to control whether only new files are returned, or all files in the dataset
1092            manifest: metadata manifest; required when `only_new_files` is True
1093        Returns:
1094            files: dictionary of file names and entityIds, with scope as specified by `only_new_files`
1095        """
1096        files = {"Filename": [], "entityId": []}
1097
1098        if only_new_files:
1099            if manifest is None:
1100                raise UnboundLocalError(
1101                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1102                )
1103
1104            if "entityId" not in manifest.columns:
1105                raise ValueError(
1106                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1107                    "Please generate an empty manifest without annotations, manually add annotations to the "
1108                    "appropriate files in the manifest, and then try again."
1109                )
1110
1111            # find new files (that are not in the current manifest), if any
1112            for file_id, file_name in dataset_files:
1113                if file_id not in manifest["entityId"].values:
1114                    files["Filename"].append(file_name)
1115                    files["entityId"].append(file_id)
1116        else:
1117            # get all files
1118            for file_id, file_name in dataset_files:
1119                files["Filename"].append(file_name)
1120                files["entityId"].append(file_id)
1121
1122        return files
1123
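    # -----------------------------------------------------------------------
    # Illustrative sketch (not part of this class): _get_file_entityIds'
    # new-file detection on toy inputs; dataset_files mirrors the (id, name)
    # tuples that getFilesInStorageDataset returns.
    # -----------------------------------------------------------------------
    # import pandas as pd
    #
    # dataset_files = [("syn1", "a.txt"), ("syn2", "b.txt"), ("syn3", "c.txt")]
    # manifest = pd.DataFrame({"entityId": ["syn1", "syn2"]})
    # new_files = {"Filename": [], "entityId": []}
    # for file_id, file_name in dataset_files:
    #     if file_id not in manifest["entityId"].values:
    #         new_files["Filename"].append(file_name)
    #         new_files["entityId"].append(file_id)
    # print(new_files)  # only syn3/c.txt is new
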
1124    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1125    def getProjectManifests(
1126        self, projectId: str
1127    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1128        """Gets all metadata manifest files across all datasets in a specified project.
1129
1130        Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest,
1131            as a list of tuples, one for each manifest:
1132                [
1133                    (
1134                        (datasetId, dataName),
1135                        (manifestId, manifestName),
1136                        (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1137                    ),
1138                    ...
1139                ]
1140
1141        TODO: Return a manifest URI instead of a Synapse ID for interoperability with other implementations of a store interface
1142        """
1143        component = None
1144        entity = None
1145        manifests = []
1146
1147        datasets = self.getStorageDatasetsInProject(projectId)
1148
1149        for datasetId, datasetName in datasets:
1150            # encode information about the manifest in a simple list (so that R clients can unpack it)
1151            # eventually can serialize differently
1152
1153            # Get the synID of the manifest for the dataset
1154            manifestId = self.getDatasetManifest(datasetId)
1155
1156            # If a manifest exists, get its annotations; else return the base 'manifest' tuple
1157            if manifestId:
1158                annotations = self.getFileAnnotations(manifestId)
1159
1160                # If the manifest has annotations specifying the component, use that
1161                if annotations and "Component" in annotations:
1162                    component = annotations["Component"]
1163                    entity = self.synapse_entity_tracker.get(
1164                        synapse_id=manifestId, syn=self.syn, download_file=False
1165                    )
1166                    manifest_name = entity["properties"]["name"]
1167
1168                # otherwise download the manifest and parse it for the information
1169                elif not annotations or "Component" not in annotations:
1170                    logging.debug(
1171                        f"No component annotations have been found for manifest {manifestId}. "
1172                        "The manifest will be downloaded and parsed instead. "
1173                        "For increased speed, add component annotations to the manifest."
1174                    )
1175
1176                    manifest_info = self.getDatasetManifest(
1177                        datasetId, downloadFile=True
1178                    )
1179                    manifest_name = manifest_info["properties"].get("name", "")
1180
1181                    if not manifest_name:
1182                        logger.error(f"Failed to download manifests from {datasetId}")
1183
1184                    manifest_path = manifest_info["path"]
1185
1186                    manifest_df = load_df(manifest_path)
1187
1188                    # Get the component from the Component column, if it exists
1189                    if (
1190                        "Component" in manifest_df
1191                        and not manifest_df["Component"].empty
1192                    ):
1193                        # collect the unique Component values
1194                        component = list(set(manifest_df["Component"]))
1195
1196                        # Added to address issues raised during DCA testing
1197                        if "" in component:
1198                            component.remove("")
1199
1200                        if len(component) == 1:
1201                            component = component[0]
1202                        elif len(component) > 1:
1203                            logging.warning(
1204                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time."
1205 "Behavior of manifests with multiple components is undefined" 1206 ) 1207 else: 1208 manifest_name = "" 1209 component = None 1210 if component: 1211 manifest = ( 1212 (datasetId, datasetName), 1213 (manifestId, manifest_name), 1214 (component, component), 1215 ) 1216 elif manifestId: 1217 logging.debug( 1218 f"Manifest {manifestId} does not have an associated Component" 1219 ) 1220 manifest = ( 1221 (datasetId, datasetName), 1222 (manifestId, manifest_name), 1223 ("", ""), 1224 ) 1225 else: 1226 manifest = ( 1227 (datasetId, datasetName), 1228 ("", ""), 1229 ("", ""), 1230 ) 1231 1232 if manifest: 1233 manifests.append(manifest) 1234 1235 return manifests 1236 1237 def upload_project_manifests_to_synapse( 1238 self, dmge: DataModelGraphExplorer, projectId: str 1239 ) -> List[str]: 1240 """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. 1241 1242 Returns: String of all the manifest_table_ids of all the manifests that have been loaded. 1243 """ 1244 1245 manifests = [] 1246 manifest_loaded = [] 1247 datasets = self.getStorageDatasetsInProject(projectId) 1248 1249 for datasetId, datasetName in datasets: 1250 # encode information about the manifest in a simple list (so that R clients can unpack it) 1251 # eventually can serialize differently 1252 1253 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1254 1255 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1256 if manifest_info: 1257 manifest_id = manifest_info["properties"]["id"] 1258 manifest_name = manifest_info["properties"]["name"] 1259 manifest_path = manifest_info["path"] 1260 manifest_df = load_df(manifest_path) 1261 manifest_table_id = uploadDB( 1262 dmge=dmge, 1263 manifest=manifest, 1264 datasetId=datasetId, 1265 table_name=datasetName, 1266 ) 1267 manifest_loaded.append(datasetName) 1268 return manifest_loaded 1269 1270 def upload_annotated_project_manifests_to_synapse( 1271 self, projectId: str, path_to_json_ld: str, dry_run: bool = False 1272 ) -> List[str]: 1273 """ 1274 Purpose: 1275 For all manifests in a project, upload them as a table and add annotations manifest csv. 1276 Assumes the manifest is already present as a CSV in a dataset in the project. 
1277
1278        """
1279        # Instantiate DataModelParser
1280        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1281        # Parse the model
1282        parsed_data_model = data_model_parser.parse_model()
1283
1284        # Instantiate DataModelGraph
1285        data_model_grapher = DataModelGraph(parsed_data_model)
1286
1287        # Generate the graph
1288        graph_data_model = data_model_grapher.generate_data_model_graph()
1289
1290        # Instantiate DataModelGraphExplorer
1291        dmge = DataModelGraphExplorer(graph_data_model)
1292
1293        manifests = []
1294        manifest_loaded = []
1295        datasets = self.getStorageDatasetsInProject(projectId)
1296        for datasetId, datasetName in datasets:
1297            # encode information about the manifest in a simple list (so that R clients can unpack it)
1298            # eventually can serialize differently
1299
1300            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1301            manifests.append(manifest)
1302
1303            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1304
1305            if manifest_info:
1306                manifest_id = manifest_info["properties"]["id"]
1307                manifest_name = manifest_info["properties"]["name"]
1308                manifest_path = manifest_info["path"]
1309                manifest = (
1310                    (datasetId, datasetName),
1311                    (manifest_id, manifest_name),
1312                    ("", ""),
1313                )
1314                if not dry_run:
1315                    self.associateMetadataWithFiles(
1316                        dmge, manifest_path, datasetId, manifest_record_type="table"
1317                    )
1318                manifest_loaded.append(manifest)
1319
1320        return manifests, manifest_loaded
1321
1322    def move_entities_to_new_project(
1323        self,
1324        projectId: str,
1325        newProjectId: str,
1326        returnEntities: bool = False,
1327        dry_run: bool = False,
1328    ):
1329        """
1330        For each manifest csv in a project, look up all the entity ids that are associated with it.
1331        Look up each entity in the files and move the entity to the new project.
1332        """
1333
1334        manifests = []
1335        manifest_loaded = []
1336        datasets = self.getStorageDatasetsInProject(projectId)
1337        if datasets:
1338            for datasetId, datasetName in datasets:
1339                # encode information about the manifest in a simple list (so that R clients can unpack it)
1340                # eventually can serialize differently
1341
1342                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1343                manifests.append(manifest)
1344
1345                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1346                if manifest_info:
1347                    manifest_id = manifest_info["properties"]["id"]
1348                    manifest_name = manifest_info["properties"]["name"]
1349                    manifest_path = manifest_info["path"]
1350                    manifest_df = load_df(manifest_path)
1351
1352                    manifest = (
1353                        (datasetId, datasetName),
1354                        (manifest_id, manifest_name),
1355                        ("", ""),
1356                    )
1357                    manifest_loaded.append(manifest)
1358
1359                    annotation_entities = self.storageFileviewTable[
1360                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1361                        & (self.storageFileviewTable["type"] == "folder")
1362                    ]["id"]
1363
1364                    if returnEntities:
1365                        for entityId in annotation_entities:
1366                            if not dry_run:
1367                                moved_entity = self.syn.move(entityId, datasetId)
1368                                self.synapse_entity_tracker.add(
1369                                    synapse_id=moved_entity.id, entity=moved_entity
1370                                )
1371                            else:
1372                                logging.info(
1373                                    f"{entityId} will be moved to folder {datasetId}."
1374 ) 1375 else: 1376 # generate project folder 1377 archive_project_folder = Folder( 1378 projectId + "_archive", parent=newProjectId 1379 ) 1380 archive_project_folder = self.syn.store(archive_project_folder) 1381 self.synapse_entity_tracker.add( 1382 synapse_id=archive_project_folder.id, 1383 entity=archive_project_folder, 1384 ) 1385 1386 # generate dataset folder 1387 dataset_archive_folder = Folder( 1388 "_".join([datasetId, datasetName, "archive"]), 1389 parent=archive_project_folder.id, 1390 ) 1391 dataset_archive_folder = self.syn.store(dataset_archive_folder) 1392 self.synapse_entity_tracker.add( 1393 synapse_id=dataset_archive_folder.id, 1394 entity=dataset_archive_folder, 1395 ) 1396 1397 for entityId in annotation_entities: 1398 # move entities to folder 1399 if not dry_run: 1400 moved_entity = self.syn.move( 1401 entityId, dataset_archive_folder.id 1402 ) 1403 self.synapse_entity_tracker.add( 1404 synapse_id=moved_entity.id, entity=moved_entity 1405 ) 1406 else: 1407 logging.info( 1408 f"{entityId} will be moved to folder {dataset_archive_folder.id}." 1409 ) 1410 else: 1411 raise LookupError( 1412 f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." 1413 ) 1414 return manifests, manifest_loaded 1415 1416 @tracer.start_as_current_span("SynapseStorage::get_synapse_table") 1417 def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: 1418 """Download synapse table as a pd dataframe; return table schema and etags as results too 1419 1420 Args: 1421 synapse_id: synapse ID of the table to query 1422 """ 1423 1424 results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) 1425 df = results.asDataFrame( 1426 rowIdAndVersionInIndex=False, 1427 na_values=STR_NA_VALUES_FILTERED, 1428 keep_default_na=False, 1429 ) 1430 1431 return df, results 1432 1433 @missing_entity_handler 1434 @tracer.start_as_current_span("SynapseStorage::uploadDB") 1435 def uploadDB( 1436 self, 1437 dmge: DataModelGraphExplorer, 1438 manifest: pd.DataFrame, 1439 datasetId: str, 1440 table_name: str, 1441 restrict: bool = False, 1442 table_manipulation: str = "replace", 1443 table_column_names: str = "class_label", 1444 ): 1445 """ 1446 Method to upload a database to an asset store. In synapse, this will upload a metadata table 1447 1448 Args: 1449 dmge: DataModelGraphExplorer object 1450 manifest: pd.Df manifest to upload 1451 datasetId: synID of the dataset for the manifest 1452 table_name: name of the table to be uploaded 1453 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1454 existingTableId: str of the synId of the existing table, if one already exists 1455 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1456 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 1457 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 1458 display label formatting. 
1433    @missing_entity_handler
1434    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1435    def uploadDB(
1436        self,
1437        dmge: DataModelGraphExplorer,
1438        manifest: pd.DataFrame,
1439        datasetId: str,
1440        table_name: str,
1441        restrict: bool = False,
1442        table_manipulation: str = "replace",
1443        table_column_names: str = "class_label",
1444    ):
1445        """
1446        Method to upload a database to an asset store. In Synapse, this will upload a metadata table
1447
1448        Args:
1449            dmge: DataModelGraphExplorer object
1450            manifest: pd.DataFrame manifest to upload
1451            datasetId: synID of the dataset for the manifest
1452            table_name: name of the table to be uploaded
1453            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1454            table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing table (replace) or be added to it (upsert)
1455            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1456                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
1457                display label formatting.
1458        Returns:
1459            manifest_table_id: synID of the uploaded table
1460            manifest: the original manifest
1461            table_manifest: manifest formatted appropriately for the table
1462
1463        """
1464
1465        col_schema, table_manifest = self.formatDB(
1466            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1467        )
1468
1469        manifest_table_id = self.buildDB(
1470            datasetId,
1471            table_name,
1472            col_schema,
1473            table_manifest,
1474            table_manipulation,
1475            dmge,
1476            restrict,
1477        )
1478
1479        return manifest_table_id, manifest, table_manifest
1480
1481    @tracer.start_as_current_span("SynapseStorage::formatDB")
1482    def formatDB(self, dmge, manifest, table_column_names):
1483        """
1484        Method to format a manifest appropriately for upload as a table
1485
1486        Args:
1487            dmge: DataModelGraphExplorer object
1488            manifest: pd.DataFrame manifest to upload
1489            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names, as described under `uploadDB`.
1490        Returns:
1491            col_schema: schema for table columns: type, size, etc.
1492            table_manifest: formatted manifest
1493
1494        """
1495        # Rename the manifest columns to display names to match the fileview
1496
1497        blacklist_chars = ["(", ")", ".", " ", "-"]
1498        manifest_columns = manifest.columns.tolist()
1499
1500        table_manifest = deepcopy(manifest)
1501
1502        if table_column_names == "display_name":
1503            cols = table_manifest.columns
1504
1505        elif table_column_names == "display_label":
1506            cols = [
1507                str(col).translate({ord(x): "" for x in blacklist_chars})
1508                for col in manifest_columns
1509            ]
1510
1511        elif table_column_names == "class_label":
1512            cols = [
1513                get_class_label_from_display_name(str(col)).translate(
1514                    {ord(x): "" for x in blacklist_chars}
1515                )
1516                for col in manifest_columns
1517            ]
1518        else:
1519            raise ValueError(
1520                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1521            )
1522
1523        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1524
1525        # Reset the column names in the table manifest
1526        table_manifest.columns = cols
1527
1528        # move entityId to the end of the df
1529        entity_col = table_manifest.pop("entityId")
1530        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1531
1532        # Get the column schema
1533        col_schema = as_table_columns(table_manifest)
1534
1535        # Set the Id column length to 64 (for some reason it is not auto-set)
1539        for i, col in enumerate(col_schema):
1540            if col["name"].lower() == "id":
1541                col_schema[i]["maximumSize"] = 64
1542
1543        return col_schema, table_manifest
1544
1545    @tracer.start_as_current_span("SynapseStorage::buildDB")
1546    def buildDB(
1547        self,
1548        datasetId: str,
1549        table_name: str,
1550        col_schema: List,
1551        table_manifest: pd.DataFrame,
1552        table_manipulation: str,
1553        dmge: DataModelGraphExplorer,
1554        restrict: bool = False,
1555    ):
1556        """
1557        Method to construct the table appropriately: create a new table, replace an existing one, or upsert new rows into an existing table.
1558        Calls the TableOperations class to execute.
1559
1560        Args:
1561            datasetId: synID of the dataset for the manifest
1562            table_name: name of the table to be uploaded
1563            col_schema: schema for table columns: type, size, etc. from `formatDB`
1564            table_manifest: formatted manifest that can be uploaded as a table
1565            table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing table (replace) or be added to it (upsert)
1566            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1567
1568        Returns:
1569            manifest_table_id: synID of the uploaded table
1570
1571        """
1572        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1573        existing_table_id = self.syn.findEntityId(
1574            name=table_name, parent=table_parent_id
1575        )
1576        tableOps = TableOperations(
1577            synStore=self,
1578            tableToLoad=table_manifest,
1579            tableName=table_name,
1580            datasetId=datasetId,
1581            existingTableId=existing_table_id,
1582            restrict=restrict,
1583            synapse_entity_tracker=self.synapse_entity_tracker,
1584        )
1585
1586        if not table_manipulation or existing_table_id is None:
1587            manifest_table_id = tableOps.createTable(
1588                columnTypeDict=col_schema,
1589                specifySchema=True,
1590            )
1591        elif existing_table_id is not None:
1592            if table_manipulation.lower() == "replace":
1593                manifest_table_id = tableOps.replaceTable(
1594                    specifySchema=True,
1595                    columnTypeDict=col_schema,
1596                )
1597            elif table_manipulation.lower() == "upsert":
1598                manifest_table_id = tableOps.upsertTable(
1599                    dmge=dmge,
1600                )
1601            elif table_manipulation.lower() == "update":
1602                manifest_table_id = tableOps.updateTable()
1603
1604        if table_manipulation and table_manipulation.lower() == "upsert":
1605            table_entity = self.synapse_entity_tracker.get(
1606                synapse_id=existing_table_id or manifest_table_id,
1607                syn=self.syn,
1608                download_file=False,
1609            )
1610            annos = OldAnnotations(
1611                id=table_entity.id,
1612                etag=table_entity.etag,
1613                values=table_entity.annotations,
1614            )
1615            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1616            annos = self.syn.set_annotations(annos)
1617            table_entity.etag = annos.etag
1618            table_entity.annotations = annos
1619
1620        return manifest_table_id
1621
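    # -----------------------------------------------------------------------
    # Illustrative sketch (not part of this class): the blacklist-character
    # translation that formatDB applies to "display_label" column names. The
    # sample column names are invented for the example.
    # -----------------------------------------------------------------------
    # blacklist_chars = ["(", ")", ".", " ", "-"]
    # cols = ["Patient ID", "Tissue (Type)", "File-Name"]
    # clean = [str(c).translate({ord(x): "" for x in blacklist_chars}) for c in cols]
    # print(clean)  # ['PatientID', 'TissueType', 'FileName']
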
1622    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1623    def upload_manifest_file(
1624        self,
1625        manifest,
1626        metadataManifestPath,
1627        datasetId,
1628        restrict_manifest,
1629        component_name="",
1630    ):
1631        # Update the manifest to have the new entityId column
1632        manifest.to_csv(metadataManifestPath, index=False)
1633
1634        # store the manifest to Synapse as a CSV
1635        # update the file name
1636        file_name_full = metadataManifestPath.split("/")[-1]
1637        file_extension = file_name_full.split(".")[-1]
1638
1639        # Differentiate "censored" and "uncensored" manifests
1640        if "censored" in file_name_full:
1641            file_name_new = (
1642                os.path.basename(CONFIG.synapse_manifest_basename)
1643                + "_"
1644                + component_name
1645                + "_censored"
1646                + "."
1647                + file_extension
1648            )
1649        else:
1650            file_name_new = (
1651                os.path.basename(CONFIG.synapse_manifest_basename)
1652                + "_"
1653                + component_name
1654                + "."
1655                + file_extension
1656            )
1657
1658        manifest_synapse_file = None
1659        try:
1660            # Rename the file to file_name_new, then revert.
1661            # This is to maintain the original file name in case other code is
1662            # expecting the file to exist with the original name.
1663            original_file_path = metadataManifestPath
1664            new_file_path = os.path.join(
1665                os.path.dirname(metadataManifestPath), file_name_new
1666            )
1667            os.rename(original_file_path, new_file_path)
1668
1669            manifest_synapse_file = self._store_file_for_manifest_upload(
1670                new_file_path=new_file_path,
1671                dataset_id=datasetId,
1672                existing_file_name=file_name_full,
1673                file_name_new=file_name_new,
1674                restrict_manifest=restrict_manifest,
1675            )
1676            manifest_synapse_file_id = manifest_synapse_file.id
1677
1678        finally:
1679            # Revert the file name back to the original
1680            os.rename(new_file_path, original_file_path)
1681
1682            if manifest_synapse_file:
1683                manifest_synapse_file.path = original_file_path
1684
1685        return manifest_synapse_file_id
1686
1687    def _store_file_for_manifest_upload(
1688        self,
1689        new_file_path: str,
1690        dataset_id: str,
1691        existing_file_name: str,
1692        file_name_new: str,
1693        restrict_manifest: bool,
1694    ) -> File:
1695        """Handles a create or update of a manifest file that is going to be uploaded.
1696        If we already have a copy of the entity in memory, we will update that instance;
1697        otherwise, create a new File instance to be created in Synapse. Once stored,
1698        this will add the file to the `synapse_entity_tracker` for future reference.
1699
1700        Args:
1701            new_file_path (str): the path to the new manifest file
1702            dataset_id (str): the Synapse ID of the dataset the manifest is associated with
1703            existing_file_name (str): the name of the existing file
1704            file_name_new (str): the name of the new file
1705            restrict_manifest (bool): whether the manifest should be restricted
1706
1707        Returns:
1708            File: the stored manifest file
1709        """
1710        local_tracked_file_instance = (
1711            self.synapse_entity_tracker.search_local_by_parent_and_name(
1712                name=existing_file_name, parent_id=dataset_id
1713            )
1714            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1715                name=file_name_new, parent_id=dataset_id
1716            )
1717        )
1718
1719        if local_tracked_file_instance:
1720            local_tracked_file_instance.path = new_file_path
1721            local_tracked_file_instance.description = (
1722                "Manifest for dataset " + dataset_id
1723            )
1724            manifest_synapse_file = local_tracked_file_instance
1725        else:
1726            manifest_synapse_file = File(
1727                path=new_file_path,
1728                description="Manifest for dataset " + dataset_id,
1729                parent=dataset_id,
1730                name=file_name_new,
1731            )
1732
1733        manifest_synapse_file = self.syn.store(
1734            manifest_synapse_file, isRestricted=restrict_manifest
1735        )
1736
1737        self.synapse_entity_tracker.add(
1738            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1739        )
1740        return manifest_synapse_file
1741
1742    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1743        """Get annotations asynchronously
1744
1745        Args:
1746            synapse_id (str): synapse id of the entity that the annotations belong to
1747
1748        Returns:
1749            Dict[str, Any]: The requested entity bundle matching
1750            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1751        """
1752        return await get_entity_id_bundle2(
1753            entity_id=synapse_id,
1754            request={"includeAnnotations": True},
1755            synapse_client=self.syn,
1756        )
1757
1758    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1759        """Store annotations asynchronously
1760
1761        Args:
1762            annotation_dict (dict): annotations in a dictionary format
1763
1764        Returns:
1765            Annotations: The stored annotations.
1766        """
1767        annotation_data = Annotations.from_dict(
1768            synapse_annotations=annotation_dict["annotations"]["annotations"]
1769        )
1770        annotation_class = Annotations(
1771            annotations=annotation_data,
1772            etag=annotation_dict["annotations"]["etag"],
1773            id=annotation_dict["annotations"]["id"],
1774        )
1775        annotation_storage_result = await annotation_class.store_async(
1776            synapse_client=self.syn
1777        )
1778        local_entity = self.synapse_entity_tracker.get(
1779            synapse_id=annotation_dict["annotations"]["id"],
1780            syn=self.syn,
1781            download_file=False,
1782            retrieve_if_not_present=False,
1783        )
1784        if local_entity:
1785            local_entity.etag = annotation_storage_result.etag
1786            local_entity.annotations = annotation_storage_result
1787        return annotation_storage_result
1788
1789    def process_row_annotations(
1790        self,
1791        dmge: DataModelGraphExplorer,
1792        metadata_syn: Dict[str, Any],
1793        hide_blanks: bool,
1794        csv_list_regex: str,
1795        annos: Dict[str, Any],
1796        annotation_keys: str,
1797    ) -> Dict[str, Any]:
1798        """Processes metadata annotations based on the logic below:
1799        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1800            an empty or whitespace-only string, or
1801            a NaN value (if the annotation is a float).
1802            If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded, and further processing of that key is skipped.
1803            If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1804
1805        2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on the "node label" or "node display name".
1806            Check if the rule contains "list" as a rule; if it does, split the string by commas and assign the resulting list as the annotation value for that key.
1807
1808        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1809
1810        4. Returns the updated annotations dictionary.
1811
1812        Args:
1813            dmge (DataModelGraphExplorer): data model graph explorer
1814            metadata_syn (dict): metadata used for Synapse storage
1815            hide_blanks (bool): if True, does not upload annotation keys with blank values.
1816            csv_list_regex (str): regex to match a comma-separated list
1817            annos (Dict[str, Any]): dictionary of annotations returned from Synapse
1818            annotation_keys (str): display_label/class_label
1819
1820        Returns:
1821            Dict[str, Any]: annotations as a dictionary
1822
1823        ```mermaid
1824        flowchart TD
1825            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1826            C -- Yes --> D{Is hide_blanks True?}
1827            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1828            D -- No --> F[Assign empty string to annotation key]
1829            C -- No --> G{Is anno_v a string?}
1830            G -- No --> H[Assign original value of anno_v to annotation key]
1831            G -- Yes --> I{Does anno_v match csv_list_regex?}
1832            I -- Yes --> J[Get validation rule of anno_k]
1833            J --> K{Does the validation rule contain 'list'}
1834            K -- Yes --> L[Split anno_v by commas and assign as list]
1835            I -- No --> H
1836            K -- No --> H
1837        ```
1838        """
1839        for anno_k, anno_v in metadata_syn.items():
1840            # Remove keys with NaN values, empty strings, or whitespace-only strings
1841            # from the dict of annotations to be uploaded, if present on the current data annotation
1842            if hide_blanks and (
1843                (isinstance(anno_v, str) and anno_v.strip() == "")
1844                or (isinstance(anno_v, float) and np.isnan(anno_v))
1845            ):
1846                # drop the key if it is present; no-op otherwise
1847                annos["annotations"]["annotations"].pop(anno_k, None)
1848
1849                continue
1850
1851            # Otherwise save the annotation as appropriate
1852            if isinstance(anno_v, float) and np.isnan(anno_v):
1853                annos["annotations"]["annotations"][anno_k] = ""
1854                continue
1855
1856            # Handle strings that match the csv_list_regex and pass the validation rule
1857            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1858                # Use a dictionary to dynamically choose the argument
1859                param = (
1860                    {"node_display_name": anno_k}
1861                    if annotation_keys == "display_label"
1862                    else {"node_label": anno_k}
1863                )
1864                node_validation_rules = dmge.get_node_validation_rules(**param)
1865
1866                if rule_in_rule_list("list", node_validation_rules):
1867                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1868                    continue
1869            # default: assign the original value
1870            annos["annotations"]["annotations"][anno_k] = anno_v
1871
1872        return annos
1873
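    # -----------------------------------------------------------------------
    # Illustrative sketch (not part of this class): the comma-separated-list
    # handling in process_row_annotations. The regex below is a stand-in
    # assumption; schematic's comma_separated_list_regex() supplies the real
    # pattern.
    # -----------------------------------------------------------------------
    # import re
    #
    # csv_list_regex = r"([^,]+,)+[^,]*"  # assumed shape: "a,b" or "a,b,"
    # anno_v = "liver,lung,heart"
    # if re.fullmatch(csv_list_regex, anno_v):
    #     anno_v = anno_v.split(",")  # stored as a list-valued annotation
    # print(anno_v)  # ['liver', 'lung', 'heart']
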
1874    @async_missing_entity_handler
1875    async def format_row_annotations(
1876        self,
1877        dmge: DataModelGraphExplorer,
1878        row: pd.Series,
1879        entityId: str,
1880        hideBlanks: bool,
1881        annotation_keys: str,
1882    ) -> Union[None, Dict[str, Any]]:
1883        """Format row annotations
1884
1885        Args:
1886            dmge (DataModelGraphExplorer): data model graph explorer object
1887            row (pd.Series): row of the manifest
1888            entityId (str): entity id of the manifest
1889            hideBlanks (bool): when True, does not upload annotation keys with blank values. When False, uploads annotation keys with empty string values
1890            annotation_keys (str): display_label/class_label
1891
1892        Returns:
1893            Union[None, Dict[str, Any]]: if the entity is in the trash can, return None. Otherwise, return the annotations
1894        """
1895        # prepare metadata for Synapse storage (resolve the display name into a name that Synapse annotations support, e.g. no spaces or parentheses)
1896        # note: the removal of special characters applies only to annotation keys; we are not altering the manifest.
1897        # this could create a divergence between manifest columns and annotations. this should be ok for most use cases.
1898        # columns with special characters are outside of the schema
1899        metadataSyn = {}
1900        blacklist_chars = ["(", ")", ".", " ", "-"]
1901
1902        for k, v in row.to_dict().items():
1903            if annotation_keys == "display_label":
1904                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1905            elif annotation_keys == "class_label":
1906                keySyn = get_class_label_from_display_name(str(k)).translate(
1907                    {ord(x): "" for x in blacklist_chars}
1908                )
1909
1910            # Skip the `Filename` and `ETag` columns when setting annotations
1911            if keySyn in ["Filename", "ETag", "eTag"]:
1912                continue
1913
1914            # truncate annotation values that are 500 characters or longer,
1915            # appending an explicit [truncatedByDataCuratorApp] marker at the end
1916            # of every truncated value to indicate that the cell value
1917            # has been truncated
1918            if isinstance(v, str) and len(v) >= 500:
1919                v = v[0:472] + "[truncatedByDataCuratorApp]"
1920
1921            metadataSyn[keySyn] = v
1922
1923        # This will first check if the entity is already in memory, and if so, that
1924        # instance is used. Unfortunately, the expected return format needs to match
1925        # the Synapse API, so we need to convert the annotations to the expected format.
1926        entity = self.synapse_entity_tracker.get(
1927            synapse_id=entityId,
1928            syn=self.syn,
1929            download_file=False,
1930            retrieve_if_not_present=False,
1931        )
1932        if entity is not None:
1933            synapse_annotations = _convert_to_annotations_list(
1934                annotations=entity.annotations
1935            )
1936            annos = {
1937                "annotations": {
1938                    "id": entity.id,
1939                    "etag": entity.etag,
1940                    "annotations": synapse_annotations,
1941                }
1942            }
1943        else:
1944            annos = await self.get_async_annotation(entityId)
1945
1946        # set annotation(s) for the various objects/items in a dataset on Synapse
1947        csv_list_regex = comma_separated_list_regex()
1948
1949        annos = self.process_row_annotations(
1950            dmge=dmge,
1951            metadata_syn=metadataSyn,
1952            hide_blanks=hideBlanks,
1953            csv_list_regex=csv_list_regex,
1954            annos=annos,
1955            annotation_keys=annotation_keys,
1956        )
1957
1958        return annos
1959
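    # -----------------------------------------------------------------------
    # Illustrative sketch (not part of this class): the truncation rule
    # applied to long annotation values in format_row_annotations above.
    # -----------------------------------------------------------------------
    # v = "x" * 600
    # if isinstance(v, str) and len(v) >= 500:
    #     v = v[0:472] + "[truncatedByDataCuratorApp]"
    # print(len(v))  # 499, keeping the stored value under 500 characters
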
1967 """ 1968 1969 entity = self.synapse_entity_tracker.get( 1970 synapse_id=manifest_synapse_id, syn=self.syn, download_file=False 1971 ) 1972 is_file = entity.concreteType.endswith(".FileEntity") 1973 is_table = entity.concreteType.endswith(".TableEntity") 1974 1975 if is_file: 1976 # Get file metadata 1977 metadata = self.getFileAnnotations(manifest_synapse_id) 1978 1979 # If there is a defined component add it to the metadata. 1980 if "Component" in manifest.columns: 1981 # Gather component information 1982 component = manifest["Component"].unique() 1983 1984 # Double check that only a single component is listed, else raise an error. 1985 try: 1986 len(component) == 1 1987 except ValueError as err: 1988 raise ValueError( 1989 f"Manifest has more than one component. Please check manifest and resubmit." 1990 ) from err 1991 1992 # Add component to metadata 1993 metadata["Component"] = component[0] 1994 1995 elif is_table: 1996 # Get table metadata 1997 metadata = self.getTableAnnotations(manifest_synapse_id) 1998 1999 # Get annotations 2000 annos = OldAnnotations( 2001 id=entity.id, etag=entity.etag, values=entity.annotations 2002 ) 2003 2004 # Add metadata to the annotations 2005 for annos_k, annos_v in metadata.items(): 2006 annos[annos_k] = annos_v 2007 return annos 2008 2009 ''' 2010 def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath, 2011 useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False): 2012 """ 2013 Purpose: 2014 Works very similarly to associateMetadataWithFiles except takes in the manifest 2015 rather than the manifest path 2016 2017 """ 2018 2019 # Add uuid for table updates and fill. 2020 if not "Uuid" in manifest.columns: 2021 manifest["Uuid"] = '' 2022 2023 for idx,row in manifest.iterrows(): 2024 if not row["Uuid"]: 2025 gen_uuid = uuid.uuid4() 2026 row["Uuid"] = gen_uuid 2027 manifest.loc[idx, 'Uuid'] = gen_uuid 2028 2029 # add entityId as a column if not already there or 2030 # fill any blanks with an empty string. 2031 if not "entityId" in manifest.columns: 2032 manifest["entityId"] = "" 2033 else: 2034 manifest["entityId"].fillna("", inplace=True) 2035 2036 # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations 2037 dmge = DataModelGraphExplorer() 2038 2039 # Create table name here. 
2040 if 'Component' in manifest.columns: 2041 table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table' 2042 else: 2043 table_name = 'synapse_storage_manifest_table' 2044 2045 # Upload manifest as a table and get the SynID and manifest 2046 manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table( 2047 dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,) 2048 2049 # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed 2050 # also set metadata for each synapse entity as Synapse annotations 2051 for idx, row in manifest.iterrows(): 2052 if not row["entityId"]: 2053 # If not using entityIds, fill with manifest_table_id so 2054 row["entityId"] = manifest_synapse_table_id 2055 entityId = '' 2056 else: 2057 # get the entity id corresponding to this row 2058 entityId = row["entityId"] 2059 2060 # Load manifest to synapse as a CSV File 2061 manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest) 2062 2063 # Get annotations for the file manifest. 2064 manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id) 2065 2066 self.syn.set_annotations(manifest_annotations) 2067 2068 logger.info("Associated manifest file with dataset on Synapse.") 2069 2070 # Update manifest Synapse table with new entity id column. 2071 self.make_synapse_table( 2072 table_to_load = table_manifest, 2073 dataset_id = datasetId, 2074 existingTableId = manifest_synapse_table_id, 2075 table_name = table_name, 2076 update_col = 'Uuid', 2077 specify_schema = False, 2078 ) 2079 2080 # Get annotations for the table manifest 2081 manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id) 2082 self.syn.set_annotations(manifest_annotations) 2083 return manifest_synapse_table_id 2084 ''' 2085 2086 def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame: 2087 """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing. 2088 Args: 2089 metadataManifestPath (str): path where manifest is stored 2090 Returns: 2091 manifest(pd.DataFrame): Manifest loaded as a pandas dataframe 2092 Raises: 2093 FileNotFoundError: Manifest file does not exist at provided path. 2094 """ 2095 # read new manifest csv 2096 try: 2097 load_args = { 2098 "dtype": "string", 2099 } 2100 manifest = load_df( 2101 metadataManifestPath, 2102 preserve_raw_input=False, 2103 allow_na_values=False, 2104 **load_args, 2105 ) 2106 except FileNotFoundError as err: 2107 raise FileNotFoundError( 2108 f"No manifest file was found at this path: {metadataManifestPath}" 2109 ) from err 2110 return manifest 2111 2112 def _add_id_columns_to_manifest( 2113 self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer 2114 ) -> pd.DataFrame: 2115 """ 2116 Ensures that the manifest DataFrame has standardized 'Id' and 'entityId' columns. 2117 2118 - If any case variation of the 'id' column is present (e.g., 'id', 'ID', 'iD'), it is renamed to 'Id'. 2119 - If any case variation of the 'entityid' column is present, it is renamed to 'entityId'. 2120 - If any case variation of the 'uuid' column is present, it is renamed to 'uuid' before further processing. 2121 - If 'Id' is still missing: 2122 - It will be created as an empty column, or 2123 - Derived from a 'Uuid' column, depending on whether 'uuid' is defined in the schema. 
2124        - If both 'uuid' and 'Id' columns exist, the 'uuid' column is dropped.
2125        - Missing values in the 'Id' column are filled with generated UUIDs.
2126        - If 'entityId' is still missing, it will be created and filled with empty strings.
2127        - If 'entityId' is already present, any missing values will be replaced with empty strings.
2128
2129        Args:
2130            manifest (pd.DataFrame): The metadata manifest to be updated.
2131            dmge (DataModelGraphExplorer): Data model graph explorer object.
2132
2133        Returns:
2134            pd.DataFrame: The updated manifest with a standardized 'Id' column and an 'entityId' column.
2135        """
2136
2137        # Normalize any variation of 'id' to 'Id', "entityid" to "entityId", "Uuid" to "uuid"
2138        for col in manifest.columns:
2139            if col.lower() == "id":
2140                manifest = manifest.rename(columns={col: ID_COLUMN})
2141            if col.lower() == "entityid":
2142                manifest = manifest.rename(columns={col: ENTITY_ID_COLUMN})
2143            if col.lower() == "uuid":
2144                manifest = manifest.rename(columns={col: UUID_COLUMN})
2145
2146        # If 'Id' still doesn't exist, see if a uuid column exists,
2147        # and if so, rename the uuid column to 'Id'
2148        if ID_COLUMN not in manifest.columns:
2149            # See if the schema has a `Uuid` column specified
2150            try:
2151                uuid_col_in_schema = dmge.is_class_in_schema(
2152                    "Uuid"
2153                ) or dmge.is_class_in_schema("uuid")
2154            except KeyError:
2155                uuid_col_in_schema = False
2156
2157            # Rename the `uuid` column if it wasn't specified in the schema
2158            if UUID_COLUMN in manifest.columns and not uuid_col_in_schema:
2159                manifest = manifest.rename(columns={UUID_COLUMN: ID_COLUMN})
2160            # If no `uuid` column exists, or it is specified in the schema, create a new `Id` column
2161            else:
2162                manifest[ID_COLUMN] = ""
2163        else:
2164            # 'Id' already exists, ignore 'uuid'
2165            if UUID_COLUMN in manifest.columns:
2166                manifest = manifest.drop(columns=[UUID_COLUMN])
2167
2168        # Fill in UUIDs in the "Id" column where missing
2169        for idx, row in manifest.iterrows():
2170            if not row["Id"]:
2171                manifest.loc[idx, ID_COLUMN] = str(uuid.uuid4())
2172
2173        # Add entityId as a column if not already there
2174        if ENTITY_ID_COLUMN not in manifest:
2175            manifest[ENTITY_ID_COLUMN] = ""
2176        else:
2177            manifest[ENTITY_ID_COLUMN] = manifest[ENTITY_ID_COLUMN].fillna("")
2178
2179        return manifest
2180
2181    def _generate_table_name(self, manifest):
2182        """Helper function to generate a table name for upload to Synapse.
2183
2184        Args:
2185            manifest: manifest loaded as a pd.DataFrame
2186
2187        Returns:
2188            table_name (str): name of the table to load
2189            component_name (str): name of the manifest component (if applicable)
2190        """
2191        # Create the table name here.
2192        if "Component" in manifest.columns:
2193            component_name = manifest["Component"][0].lower()
2194            table_name = component_name + "_synapse_storage_manifest_table"
2195        else:
2196            component_name = ""
2197            table_name = "synapse_storage_manifest_table"
2198        return table_name, component_name
2199
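    # -----------------------------------------------------------------------
    # Illustrative sketch (not part of this class): the column normalization
    # and UUID fill performed by _add_id_columns_to_manifest, on a toy
    # manifest invented for the example.
    # -----------------------------------------------------------------------
    # import uuid
    # import pandas as pd
    #
    # manifest = pd.DataFrame({"ID": ["", "keep-this-id"], "EntityID": [None, "syn2"]})
    # manifest = manifest.rename(columns={"ID": "Id", "EntityID": "entityId"})
    # for idx, row in manifest.iterrows():
    #     if not row["Id"]:
    #         manifest.loc[idx, "Id"] = str(uuid.uuid4())
    # manifest["entityId"] = manifest["entityId"].fillna("")
    # print(manifest)  # row 0 gains a generated UUID; missing entityIds become ""
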
2212 2213 """ 2214 rowEntity = Folder(str(uuid.uuid4()), parent=datasetId) 2215 rowEntity = self.syn.store(rowEntity) 2216 entityId = rowEntity["id"] 2217 self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity) 2218 row["entityId"] = entityId 2219 manifest.loc[idx, "entityId"] = entityId 2220 return manifest, entityId 2221 2222 async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None: 2223 """Process annotations and store them on synapse asynchronously 2224 2225 Args: 2226 requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step 2227 2228 Raises: 2229 RuntimeError: raise a run time error if a task failed to complete 2230 """ 2231 while requests: 2232 done_tasks, pending_tasks = await asyncio.wait( 2233 requests, return_when=asyncio.FIRST_COMPLETED 2234 ) 2235 requests = pending_tasks 2236 2237 for completed_task in done_tasks: 2238 try: 2239 annos = completed_task.result() 2240 2241 if isinstance(annos, Annotations): 2242 logger.info(f"Successfully stored annotations for {annos.id}") 2243 else: 2244 # store annotations if they are not None 2245 if annos: 2246 entity_id = annos["annotations"]["id"] 2247 logger.info( 2248 f"Obtained and processed annotations for {entity_id} entity" 2249 ) 2250 requests.add( 2251 asyncio.create_task( 2252 self.store_async_annotation(annotation_dict=annos) 2253 ) 2254 ) 2255 except Exception as e: 2256 raise RuntimeError(f"failed with { repr(e) }.") from e 2257 2258 @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files") 2259 async def add_annotations_to_entities_files( 2260 self, 2261 dmge, 2262 manifest, 2263 manifest_record_type: str, 2264 datasetId: str, 2265 hideBlanks: bool, 2266 manifest_synapse_table_id="", 2267 annotation_keys: str = "class_label", 2268 ): 2269 """ 2270 Depending on upload type add Ids to entityId row. Add anotations to connected 2271 files and folders. Despite the name of this function, it also applies to folders. 2272 2273 Args: 2274 dmge: DataModelGraphExplorer Object 2275 manifest (pd.DataFrame): loaded df containing user supplied data. 2276 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2277 datasetId (str): synapse ID of folder containing the dataset 2278 hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2279 manifest_synapse_table_id (str): Default is an empty string ''. 2280 annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display 2281 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2282 display label formatting while ensuring the label is formatted properly for Synapse annotations. 
2258    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2259    async def add_annotations_to_entities_files(
2260        self,
2261        dmge,
2262        manifest,
2263        manifest_record_type: str,
2264        datasetId: str,
2265        hideBlanks: bool,
2266        manifest_synapse_table_id="",
2267        annotation_keys: str = "class_label",
2268    ):
2269        """
2270        Depending on the upload type, add ids to the entityId column, and add annotations to the connected
2271        files and folders. Despite the name of this function, it also applies to folders.
2272
2273        Args:
2274            dmge: DataModelGraphExplorer object
2275            manifest (pd.DataFrame): loaded df containing user supplied data.
2276            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
2277            datasetId (str): synapse ID of the folder containing the dataset
2278            hideBlanks (bool): Default is False. Boolean flag that, when True, does not upload annotation keys with blank values; when False, uploads annotation keys with empty string values.
2279            manifest_synapse_table_id (str): Default is an empty string ''.
2280            annotation_keys (str): display_label/class_label (default). Determines the labeling style for annotation keys. class_label will format the display
2281                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2282                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2283        Returns:
2284            manifest (pd.DataFrame): modified to add entityId as appropriate
2285
2286        """
2287
2288        # Expected behavior is to annotate files if `Filename` is present and file_annotations_upload is set to True, regardless of the `-mrt` setting
2289        if "filename" in [col.lower() for col in manifest.columns]:
2290            # get the current list of files and store as a dataframe
2291            dataset_files = self.getFilesInStorageDataset(datasetId)
2292            files_and_entityIds = self._get_file_entityIds(
2293                dataset_files=dataset_files, only_new_files=False
2294            )
2295            file_df = pd.DataFrame(files_and_entityIds)
2296
2297            # Merge dataframes to add entityIds
2298            manifest = manifest.merge(
2299                file_df, how="left", on="Filename", suffixes=["_x", None]
2300            ).drop("entityId_x", axis=1)
2301
2302        # Fill `entityId` for each row if missing, and annotate the entity as appropriate
2303        requests = set()
2304        for idx, row in manifest.iterrows():
2305            if not row["entityId"] and (
2306                manifest_record_type == "file_and_entities"
2307                or manifest_record_type == "table_file_and_entities"
2308            ):
2309                manifest, entityId = self._create_entity_id(
2310                    idx, row, manifest, datasetId
2311                )
2312            elif not row["entityId"] and manifest_record_type == "table_and_file":
2313                # If not using entityIds, fill the column with manifest_synapse_table_id, so the row still references a Synapse entity
2314                row["entityId"] = manifest_synapse_table_id
2315                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2316                entityId = ""
2317            # If the row is the manifest table, do not add annotations
2318            elif row["entityId"] == manifest_synapse_table_id:
2319                entityId = ""
2320            else:
2321                # get the file id of the file to annotate, collected in the step above
2322                entityId = row["entityId"]
2323
2324            # Adding annotations to connected files.
2325            if entityId:
2326                # Format annotations for Synapse
2327                annos_task = asyncio.create_task(
2328                    self.format_row_annotations(
2329                        dmge, row, entityId, hideBlanks, annotation_keys
2330                    )
2331                )
2332                requests.add(annos_task)
2333        await self._process_store_annos(requests)
2334        return manifest
2335
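    # -----------------------------------------------------------------------
    # Illustrative sketch (not part of this class): the left-merge used above
    # to attach entityIds from the dataset file listing to a file-based
    # manifest. The toy frames are assumptions for the example.
    # -----------------------------------------------------------------------
    # import pandas as pd
    #
    # manifest = pd.DataFrame({"Filename": ["a.txt", "b.txt"], "entityId": ["", ""]})
    # file_df = pd.DataFrame({"Filename": ["a.txt", "b.txt"], "entityId": ["syn1", "syn2"]})
    # manifest = manifest.merge(
    #     file_df, how="left", on="Filename", suffixes=["_x", None]
    # ).drop("entityId_x", axis=1)
    # print(manifest)  # entityId now populated from the dataset listing
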
2336    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2337    def upload_manifest_as_table(
2338        self,
2339        dmge: DataModelGraphExplorer,
2340        manifest: pd.DataFrame,
2341        metadataManifestPath: str,
2342        datasetId: str,
2343        table_name: str,
2344        component_name: str,
2345        restrict: bool,
2346        manifest_record_type: str,
2347        hideBlanks: bool,
2348        table_manipulation: str,
2349        table_column_names: str,
2350        annotation_keys: str,
2351        file_annotations_upload: bool = True,
2352    ):
2353        """Upload the manifest to Synapse as both a table and a csv.
2354        Args:
2355            dmge: DataModelGraphExplorer object
2356            manifest (pd.DataFrame): loaded df containing user supplied data.
2357            metadataManifestPath: path to the csv containing a validated metadata manifest.
2358            datasetId (str): synapse ID of the folder containing the dataset
2359            table_name (str): generated name for the table being uploaded.
2360            component_name (str): name of the component manifest that is currently being uploaded.
2361            restrict (bool): flag for censored data.
2362            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
2363            hideBlanks (bool): Default is False. Boolean flag that, when True, does not upload annotation keys with blank values; when False, uploads annotation keys with empty string values.
2364            table_manipulation (str): specifies how the manifest table should be stored on Synapse when a table with the same name already exists. Options are 'replace' and 'upsert'.
2365            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2366                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2367                display label formatting.
2368            annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
2369                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2370                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2371            file_annotations_upload (bool): Default is True. If False, do not add annotations to files.
2372        Return:
2373            manifest_synapse_file_id: SynID of the manifest csv uploaded to Synapse.
2374        """
2375        # Upload the manifest as a table, and get the ID and updated manifest.
2376        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2377            dmge=dmge,
2378            manifest=manifest,
2379            datasetId=datasetId,
2380            table_name=table_name,
2381            restrict=restrict,
2382            table_manipulation=table_manipulation,
2383            table_column_names=table_column_names,
2384        )
2385
2386        if file_annotations_upload:
2387            manifest = asyncio.run(
2388                self.add_annotations_to_entities_files(
2389                    dmge,
2390                    manifest,
2391                    manifest_record_type,
2392                    datasetId,
2393                    hideBlanks,
2394                    manifest_synapse_table_id,
2395                    annotation_keys,
2396                )
2397            )
2398        # Load the manifest to Synapse as a CSV File
2399        manifest_synapse_file_id = self.upload_manifest_file(
2400            manifest=manifest,
2401            metadataManifestPath=metadataManifestPath,
2402            datasetId=datasetId,
2403            restrict_manifest=restrict,
2404            component_name=component_name,
2405        )
2406
2407        # Set annotations for the file manifest.
2408        manifest_annotations = self.format_manifest_annotations(
2409            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2410        )
2411        annos = self.syn.set_annotations(annotations=manifest_annotations)
2412        manifest_entity = self.synapse_entity_tracker.get(
2413            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2414        )
2415        manifest_entity.annotations = annos
2416        manifest_entity.etag = annos.etag
2417
2418        logger.info("Associated manifest file with dataset on Synapse.")
2419
2420        # Update the manifest Synapse table with the new entityId column.
2421        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2422            dmge=dmge,
2423            manifest=manifest,
2424            datasetId=datasetId,
2425            table_name=table_name,
2426            restrict=restrict,
2427            table_manipulation="update",
2428            table_column_names=table_column_names,
2429        )
2430
2431        # Set annotations for the table manifest
2432        manifest_annotations = self.format_manifest_annotations(
2433            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2434        )
2435        annotations_manifest_table = self.syn.set_annotations(
2436            annotations=manifest_annotations
2437        )
2438        manifest_table_entity = self.synapse_entity_tracker.get(
2439            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2440        )
2441        manifest_table_entity.annotations = annotations_manifest_table
2442        manifest_table_entity.etag = annotations_manifest_table.etag
2443
2444        return manifest_synapse_file_id
2445
2446    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2447    def upload_manifest_as_csv(
2448        self,
2449        dmge,
2450        manifest,
2451        metadataManifestPath,
2452        datasetId,
2453        restrict,
2454        manifest_record_type,
2455        hideBlanks,
2456        component_name,
2457        annotation_keys: str,
2458        file_annotations_upload: bool = True,
2459    ):
2460        """Upload the manifest to Synapse as a csv only.
2461        Args:
2462            dmge: DataModelGraphExplorer object
2463            manifest (pd.DataFrame): loaded df containing user supplied data.
2464            metadataManifestPath: path to the csv containing a validated metadata manifest.
2465            datasetId (str): synapse ID of the folder containing the dataset
2466            restrict (bool): flag for censored data.
2467            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
2468            hideBlanks (bool): Default is False. Boolean flag that, when True, does not upload annotation keys with blank values; when False, uploads annotation keys with empty string values.
2469            annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
2470                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2471                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2472            file_annotations_upload (bool): Default is True. If False, do not add annotations to files.
2473        Return:
2474            manifest_synapse_file_id (str): SynID of the manifest csv uploaded to Synapse.
2475        """
2476        if file_annotations_upload:
2477            manifest = asyncio.run(
2478                self.add_annotations_to_entities_files(
2479                    dmge,
2480                    manifest,
2481                    manifest_record_type,
2482                    datasetId,
2483                    hideBlanks,
2484                    annotation_keys=annotation_keys,
2485                )
2486            )
2487
2488        # Load the manifest to Synapse as a CSV File
2489        manifest_synapse_file_id = self.upload_manifest_file(
2490            manifest,
2491            metadataManifestPath,
2492            datasetId,
2493            restrict,
2494            component_name=component_name,
2495        )
2496
2497        # Set annotations for the file manifest.
2498 manifest_annotations = self.format_manifest_annotations( 2499 manifest, manifest_synapse_file_id 2500 ) 2501 annos = self.syn.set_annotations(manifest_annotations) 2502 manifest_entity = self.synapse_entity_tracker.get( 2503 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2504 ) 2505 manifest_entity.annotations = annos 2506 manifest_entity.etag = annos.etag 2507 2508 logger.info("Associated manifest file with dataset on Synapse.") 2509 2510 return manifest_synapse_file_id 2511 2512 @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo") 2513 def upload_manifest_combo( 2514 self, 2515 dmge, 2516 manifest, 2517 metadataManifestPath, 2518 datasetId, 2519 table_name, 2520 component_name, 2521 restrict, 2522 manifest_record_type, 2523 hideBlanks, 2524 table_manipulation, 2525 table_column_names: str, 2526 annotation_keys: str, 2527 file_annotations_upload: bool = True, 2528 ): 2529 """Upload manifest to Synapse as a table and CSV with entities. 2530 Args: 2531 dmge: DataModelGraphExplorer object 2532 manifest (pd.DataFrame): loaded df containing user supplied data. 2533 metadataManifestPath: path to csv containing a validated metadata manifest. 2534 datasetId (str): synapse ID of folder containing the dataset 2535 table_name (str): Generated to name the table being uploaded. 2536 component_name (str): Name of the component manifest that is currently being uploaded. 2537 restrict (bool): Flag for censored data. 2538 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2539 hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2540 table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2541 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2542 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2543 display label formatting. 2544 annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display 2545 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2546 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2547 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2548 Return: 2549 manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 
2550 """ 2551 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2552 dmge=dmge, 2553 manifest=manifest, 2554 datasetId=datasetId, 2555 table_name=table_name, 2556 restrict=restrict, 2557 table_manipulation=table_manipulation, 2558 table_column_names=table_column_names, 2559 ) 2560 2561 if file_annotations_upload: 2562 manifest = asyncio.run( 2563 self.add_annotations_to_entities_files( 2564 dmge, 2565 manifest, 2566 manifest_record_type, 2567 datasetId, 2568 hideBlanks, 2569 manifest_synapse_table_id, 2570 annotation_keys=annotation_keys, 2571 ) 2572 ) 2573 2574 # Load manifest to synapse as a CSV File 2575 manifest_synapse_file_id = self.upload_manifest_file( 2576 manifest, metadataManifestPath, datasetId, restrict, component_name 2577 ) 2578 2579 # Set annotations for the file manifest. 2580 manifest_annotations = self.format_manifest_annotations( 2581 manifest, manifest_synapse_file_id 2582 ) 2583 file_manifest_annoations = self.syn.set_annotations(manifest_annotations) 2584 manifest_entity = self.synapse_entity_tracker.get( 2585 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2586 ) 2587 manifest_entity.annotations = file_manifest_annoations 2588 manifest_entity.etag = file_manifest_annoations.etag 2589 logger.info("Associated manifest file with dataset on Synapse.") 2590 2591 # Update manifest Synapse table with new entity id column. 2592 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2593 dmge=dmge, 2594 manifest=manifest, 2595 datasetId=datasetId, 2596 table_name=table_name, 2597 restrict=restrict, 2598 table_manipulation="update", 2599 table_column_names=table_column_names, 2600 ) 2601 2602 # Set annotations for the table manifest 2603 manifest_annotations = self.format_manifest_annotations( 2604 manifest, manifest_synapse_table_id 2605 ) 2606 table_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2607 manifest_entity = self.synapse_entity_tracker.get( 2608 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2609 ) 2610 manifest_entity.annotations = table_manifest_annotations 2611 manifest_entity.etag = table_manifest_annotations.etag 2612 return manifest_synapse_file_id 2613 2614 @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") 2615 def associateMetadataWithFiles( 2616 self, 2617 dmge: DataModelGraphExplorer, 2618 metadataManifestPath: str, 2619 datasetId: str, 2620 manifest_record_type: str = "table_file_and_entities", 2621 hideBlanks: bool = False, 2622 restrict_manifest=False, 2623 table_manipulation: str = "replace", 2624 table_column_names: str = "class_label", 2625 annotation_keys: str = "class_label", 2626 file_annotations_upload: bool = True, 2627 ) -> str: 2628 """Associate metadata with files in a storage dataset already on Synapse. 2629 Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. 2630 2631 If this is a new manifest there could be no Synapse entities associated with the rows of this manifest 2632 this may be due to data type (e.g. clinical data) being tabular 2633 and not requiring files; to utilize uniform interfaces downstream 2634 (i.e. fileviews), a Synapse entity (a folder) is created for each row 2635 and an entity column is added to the manifest containing the resulting 2636 entity IDs; a table is also created at present as an additional interface 2637 for downstream query and interaction with the data. 
2638 2639 Args: 2640 dmge: DataModelGraphExplorer Object 2641 metadataManifestPath: path to csv containing a validated metadata manifest. 2642 The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. 2643 Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item. 2644 In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to his file. 2645 datasetId: synapse ID of folder containing the dataset 2646 manifest_record_type: Default value is 'table_file_and_entities'. valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest.'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_with_entites and table in combination. 2647 hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2648 restrict_manifest (bool): Default is false. Flag for censored data. 2649 table_malnipulation (str): Default is 'replace'. Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2650 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2651 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2652 display label formatting. 2653 annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display 2654 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2655 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2656 Returns: 2657 manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. 
2658 """ 2659 # Read new manifest CSV: 2660 manifest = self._read_manifest(metadataManifestPath) 2661 manifest = self._add_id_columns_to_manifest(manifest, dmge) 2662 2663 table_name, component_name = self._generate_table_name(manifest) 2664 2665 # Upload manifest to synapse based on user input (manifest_record_type) 2666 if manifest_record_type == "file_only": 2667 manifest_synapse_file_id = self.upload_manifest_as_csv( 2668 dmge=dmge, 2669 manifest=manifest, 2670 metadataManifestPath=metadataManifestPath, 2671 datasetId=datasetId, 2672 restrict=restrict_manifest, 2673 hideBlanks=hideBlanks, 2674 manifest_record_type=manifest_record_type, 2675 component_name=component_name, 2676 annotation_keys=annotation_keys, 2677 file_annotations_upload=file_annotations_upload, 2678 ) 2679 elif manifest_record_type == "table_and_file": 2680 manifest_synapse_file_id = self.upload_manifest_as_table( 2681 dmge=dmge, 2682 manifest=manifest, 2683 metadataManifestPath=metadataManifestPath, 2684 datasetId=datasetId, 2685 table_name=table_name, 2686 component_name=component_name, 2687 restrict=restrict_manifest, 2688 hideBlanks=hideBlanks, 2689 manifest_record_type=manifest_record_type, 2690 table_manipulation=table_manipulation, 2691 table_column_names=table_column_names, 2692 annotation_keys=annotation_keys, 2693 file_annotations_upload=file_annotations_upload, 2694 ) 2695 elif manifest_record_type == "file_and_entities": 2696 manifest_synapse_file_id = self.upload_manifest_as_csv( 2697 dmge=dmge, 2698 manifest=manifest, 2699 metadataManifestPath=metadataManifestPath, 2700 datasetId=datasetId, 2701 restrict=restrict_manifest, 2702 hideBlanks=hideBlanks, 2703 manifest_record_type=manifest_record_type, 2704 component_name=component_name, 2705 annotation_keys=annotation_keys, 2706 file_annotations_upload=file_annotations_upload, 2707 ) 2708 elif manifest_record_type == "table_file_and_entities": 2709 manifest_synapse_file_id = self.upload_manifest_combo( 2710 dmge=dmge, 2711 manifest=manifest, 2712 metadataManifestPath=metadataManifestPath, 2713 datasetId=datasetId, 2714 table_name=table_name, 2715 component_name=component_name, 2716 restrict=restrict_manifest, 2717 hideBlanks=hideBlanks, 2718 manifest_record_type=manifest_record_type, 2719 table_manipulation=table_manipulation, 2720 table_column_names=table_column_names, 2721 annotation_keys=annotation_keys, 2722 file_annotations_upload=file_annotations_upload, 2723 ) 2724 else: 2725 raise ValueError("Please enter a valid manifest_record_type.") 2726 return manifest_synapse_file_id 2727 2728 def getTableAnnotations(self, table_id: str): 2729 """Generate dictionary of annotations for the given Synapse file. 2730 Synapse returns all custom annotations as lists since they 2731 can contain multiple values. In all cases, the values will 2732 be converted into strings and concatenated with ", ". 2733 2734 Args: 2735 fileId (str): Synapse ID for dataset file. 2736 2737 Returns: 2738 dict: Annotations as comma-separated strings. 
2739 """ 2740 try: 2741 entity = self.synapse_entity_tracker.get( 2742 synapse_id=table_id, syn=self.syn, download_file=False 2743 ) 2744 is_table = entity.concreteType.endswith(".TableEntity") 2745 annotations_raw = entity.annotations 2746 except SynapseHTTPError: 2747 # If an error occurs with retrieving entity, skip it 2748 # This could be caused by a temporary file view that 2749 # was deleted since its ID was retrieved 2750 is_file, is_table = False, False 2751 2752 # Skip anything that isn't a file or folder 2753 if not (is_table): 2754 return None 2755 2756 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2757 2758 return annotations 2759 2760 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2761 """Generate dictionary of annotations for the given Synapse file. 2762 Synapse returns all custom annotations as lists since they 2763 can contain multiple values. In all cases, the values will 2764 be converted into strings and concatenated with ", ". 2765 2766 Args: 2767 fileId (str): Synapse ID for dataset file. 2768 2769 Returns: 2770 dict: Annotations as comma-separated strings. 2771 """ 2772 2773 # Get entity metadata, including annotations 2774 try: 2775 entity = self.synapse_entity_tracker.get( 2776 synapse_id=fileId, syn=self.syn, download_file=False 2777 ) 2778 is_file = entity.concreteType.endswith(".FileEntity") 2779 is_folder = entity.concreteType.endswith(".Folder") 2780 annotations_raw = entity.annotations 2781 except SynapseHTTPError: 2782 # If an error occurs with retrieving entity, skip it 2783 # This could be caused by a temporary file view that 2784 # was deleted since its ID was retrieved 2785 is_file, is_folder = False, False 2786 2787 # Skip anything that isn't a file or folder 2788 if not (is_file or is_folder): 2789 return None 2790 2791 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2792 2793 return annotations 2794 2795 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2796 # Extract annotations from their lists and stringify. For example: 2797 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2798 annotations = dict() 2799 for key, vals in annotations_raw.items(): 2800 if isinstance(vals, list) and len(vals) == 1: 2801 annotations[key] = str(vals[0]) 2802 else: 2803 annotations[key] = ", ".join(str(v) for v in vals) 2804 2805 # Add the file entity ID and eTag, which weren't lists 2806 assert fileId == entity.id, ( 2807 "For some reason, the Synapse ID in the response doesn't match" 2808 "the Synapse ID sent in the request (via synapseclient)." 2809 ) 2810 annotations["entityId"] = fileId 2811 annotations["eTag"] = entity.etag 2812 2813 return annotations 2814 2815 def getDatasetAnnotations( 2816 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2817 ) -> pd.DataFrame: 2818 """Generate table for annotations across all files in given dataset. 2819 2820 Args: 2821 datasetId (str): Synapse ID for dataset folder. 2822 fill_na (bool): Whether to replace missing values with 2823 blank strings. 2824 force_batch (bool): Whether to force the function to use 2825 the batch mode, which uses a file view to retrieve 2826 annotations for a given dataset. Default to False 2827 unless there are more than 50 files in the dataset. 2828 2829 Returns: 2830 pd.DataFrame: Table of annotations. 
2831 """ 2832 # Get all files in given dataset 2833 dataset_files = self.getFilesInStorageDataset(datasetId) 2834 2835 # if there are no dataset files, there are no annotations 2836 # return None 2837 if not dataset_files: 2838 return pd.DataFrame() 2839 2840 dataset_files_map = dict(dataset_files) 2841 dataset_file_ids, _ = list(zip(*dataset_files)) 2842 2843 # Get annotations for each file from Step 1 2844 # Batch mode 2845 try_batch = len(dataset_files) >= 50 or force_batch 2846 if try_batch: 2847 try: 2848 logger.info("Trying batch mode for retrieving Synapse annotations") 2849 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2850 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2851 logger.info( 2852 f"Unable to create a temporary file view bound to {datasetId}. " 2853 "Defaulting to slower iterative retrieval of annotations." 2854 ) 2855 # Default to the slower non-batch method 2856 logger.info("Batch mode failed (probably due to permission error)") 2857 try_batch = False 2858 2859 # Non-batch mode 2860 if not try_batch: 2861 logger.info("Using slower (non-batch) sequential mode") 2862 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2863 # Remove any annotations for non-file/folders (stored as None) 2864 records = filter(None, records) 2865 table = pd.DataFrame.from_records(records) 2866 2867 # Add filenames for the files that "survived" annotation retrieval 2868 filenames = [dataset_files_map[i] for i in table["entityId"]] 2869 2870 if "Filename" not in table.columns: 2871 table.insert(0, "Filename", filenames) 2872 2873 # Ensure that entityId and eTag are at the end 2874 entity_ids = table.pop("entityId") 2875 etags = table.pop("eTag") 2876 table.insert(len(table.columns), "entityId", entity_ids) 2877 table.insert(len(table.columns), "eTag", etags) 2878 2879 # Missing values are filled in with empty strings for Google Sheets 2880 if fill_na: 2881 table.fillna("", inplace=True) 2882 2883 # Force all values as strings 2884 return table.astype(str) 2885 2886 def raise_final_error(retry_state): 2887 return retry_state.outcome.result() 2888 2889 def checkIfinAssetView(self, syn_id) -> str: 2890 # get data in administrative fileview for this pipeline 2891 assetViewTable = self.getStorageFileviewTable() 2892 all_files = list(assetViewTable["id"]) 2893 if syn_id in all_files: 2894 return True 2895 else: 2896 return False 2897 2898 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2899 @retry( 2900 stop=stop_after_attempt(5), 2901 wait=wait_chain( 2902 *[wait_fixed(10) for i in range(2)] 2903 + [wait_fixed(15) for i in range(2)] 2904 + [wait_fixed(20)] 2905 ), 2906 retry=retry_if_exception_type(LookupError), 2907 retry_error_callback=raise_final_error, 2908 ) 2909 def getDatasetProject(self, datasetId: str) -> str: 2910 """Get parent project for a given dataset ID. 2911 2912 Args: 2913 datasetId (str): Synapse entity ID (folder or project). 2914 2915 Raises: 2916 ValueError: Raised if Synapse ID cannot be retrieved 2917 by the user or if it doesn't appear in the file view. 2918 2919 Returns: 2920 str: The Synapse ID for the parent project. 
2921 """ 2922 2923 # Subset main file view 2924 dataset_index = self.storageFileviewTable["id"] == datasetId 2925 dataset_row = self.storageFileviewTable[dataset_index] 2926 2927 # re-query if no datasets found 2928 if dataset_row.empty: 2929 sleep(5) 2930 self.query_fileview(force_requery=True) 2931 # Subset main file view 2932 dataset_index = self.storageFileviewTable["id"] == datasetId 2933 dataset_row = self.storageFileviewTable[dataset_index] 2934 2935 # Return `projectId` for given row if only one found 2936 if len(dataset_row) == 1: 2937 dataset_project = dataset_row["projectId"].values[0] 2938 return dataset_project 2939 2940 # Otherwise, check if already project itself 2941 try: 2942 syn_object = self.synapse_entity_tracker.get( 2943 synapse_id=datasetId, syn=self.syn, download_file=False 2944 ) 2945 if syn_object.properties["concreteType"].endswith("Project"): 2946 return datasetId 2947 except SynapseHTTPError: 2948 raise PermissionError( 2949 f"The given dataset ({datasetId}) isn't accessible with this " 2950 "user. This might be caused by a typo in the dataset Synapse ID." 2951 ) 2952 2953 # If not, then assume dataset not in file view 2954 raise LookupError( 2955 f"The given dataset ({datasetId}) doesn't appear in the " 2956 f"configured file view ({self.storageFileview}). This might " 2957 "mean that the file view's scope needs to be updated." 2958 ) 2959 2960 def getDatasetAnnotationsBatch( 2961 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2962 ) -> pd.DataFrame: 2963 """Generate table for annotations across all files in given dataset. 2964 This function uses a temporary file view to generate a table 2965 instead of iteratively querying for individual entity annotations. 2966 This function is expected to run much faster than 2967 `self.getDatasetAnnotationsBatch` on large datasets. 2968 2969 Args: 2970 datasetId (str): Synapse ID for dataset folder. 2971 dataset_file_ids (Sequence[str]): List of Synapse IDs 2972 for dataset files/folders used to subset the table. 2973 2974 Returns: 2975 pd.DataFrame: Table of annotations. 2976 """ 2977 # Create data frame from annotations file view 2978 with DatasetFileView(datasetId, self.syn) as fileview: 2979 table = fileview.query() 2980 2981 if dataset_file_ids: 2982 table = table.loc[table.index.intersection(dataset_file_ids)] 2983 2984 table = table.reset_index(drop=True) 2985 2986 return table 2987 2988 def _get_table_schema_by_cname(self, table_schema): 2989 # assume no duplicate column names in the table 2990 table_schema_by_cname = {} 2991 2992 for col_record in table_schema: 2993 # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key) 2994 table_schema_by_cname[col_record["name"]] = col_record 2995 2996 return table_schema_by_cname
Implementation of the Storage interface for datasets/files stored on Synapse. Provides utilities to list files in a specific project, update file annotations, create fileviews, etc.
TODO: Need to define the interface and rename and/or refactor some of the methods below.
303 @tracer.start_as_current_span("SynapseStorage::__init__") 304 def __init__( 305 self, 306 token: Optional[str] = None, # optional parameter retrieved from browser cookie 307 access_token: Optional[str] = None, 308 project_scope: Optional[list] = None, 309 synapse_cache_path: Optional[str] = None, 310 perform_query: Optional[bool] = True, 311 columns: Optional[list] = None, 312 where_clauses: Optional[list] = None, 313 ) -> None: 314 """Initializes a SynapseStorage object. 315 316 Args: 317 token (Optional[str], optional): 318 Optional token parameter as found in browser cookie upon login to synapse. 319 Defaults to None. 320 access_token (Optional[list], optional): 321 Optional access token (personal or oauth). 322 Defaults to None. 323 project_scope (Optional[list], optional): Defaults to None. 324 synapse_cache_path (Optional[str], optional): 325 Location of synapse cache. 326 Defaults to None. 327 TODO: 328 Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped querys expands. 329 """ 330 self.syn = self.login(synapse_cache_path, access_token) 331 self.project_scope = project_scope 332 self.storageFileview = CONFIG.synapse_master_fileview_id 333 self.manifest = CONFIG.synapse_manifest_basename 334 self.root_synapse_cache = self.syn.cache.cache_root_dir 335 self.synapse_entity_tracker = SynapseEntityTracker() 336 if perform_query: 337 self.query_fileview(columns=columns, where_clauses=where_clauses)
Initializes a SynapseStorage object.
Arguments:
- token (Optional[str], optional): Optional token parameter as found in browser cookie upon login to Synapse. Defaults to None.
- access_token (Optional[str], optional): Optional access token (personal or OAuth). Defaults to None.
- project_scope (Optional[list], optional): Defaults to None.
- synapse_cache_path (Optional[str], optional): Location of the Synapse cache. Defaults to None.
TODO:
Consider the necessity of adding "columns" and "where_clauses" params to the constructor. Currently, with how query_fileview is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
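As a quick orientation, here is a minimal, hedged usage sketch of the constructor; the access token is a placeholder, and it assumes CONFIG already points at a valid master fileview:

from schematic.store.synapse import SynapseStorage

# Log in and run the initial fileview query (the default behavior).
store = SynapseStorage(access_token="my-placeholder-token")

# Or defer the potentially slow fileview query and scope it later:
lazy_store = SynapseStorage(access_token="my-placeholder-token", perform_query=False)
lazy_store.query_fileview(columns=["id", "name", "parentId"])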
376 @tracer.start_as_current_span("SynapseStorage::query_fileview") 377 def query_fileview( 378 self, 379 columns: Optional[list] = None, 380 where_clauses: Optional[list] = None, 381 force_requery: Optional[bool] = False, 382 ) -> None: 383 """ 384 Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. 385 Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes. 386 Args: 387 columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns. 388 where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None. 389 force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False. 390 """ 391 self._purge_synapse_cache() 392 393 # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed 394 self.new_query_different = True 395 396 # If a query has already been performed, store the query 397 previous_query_built = hasattr(self, "fileview_query") 398 if previous_query_built: 399 previous_query = self.fileview_query 400 401 # Build a query with the current given parameters and check to see if it is different from the previous 402 self._build_query(columns=columns, where_clauses=where_clauses) 403 if previous_query_built: 404 self.new_query_different = self.fileview_query != previous_query 405 406 # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved 407 if self.new_query_different or force_requery: 408 try: 409 self.storageFileviewTable = self.syn.tableQuery( 410 query=self.fileview_query, 411 ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) 412 except SynapseHTTPError as exc: 413 exception_text = str(exc) 414 if "Unknown column path" in exception_text: 415 raise ValueError( 416 "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation." 417 ) 418 elif "Unknown column" in exception_text: 419 missing_column = exception_text.split("Unknown column ")[-1] 420 raise ValueError( 421 f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview." 422 ) 423 else: 424 raise AccessCredentialsError(self.storageFileview)
Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. It is called once during initialization of the SynapseStorage object and can be called again later to specify a more limited scope for validation purposes.
Arguments:
- columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
- where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
- force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
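A short sketch of re-scoping the query, reusing the store instance from the constructor sketch above; syn123 is a placeholder ID and the clause syntax follows Synapse table SQL:

# Narrow the fileview to files directly under one folder. The query is
# skipped when it is identical to the previous one, unless force_requery=True.
store.query_fileview(
    columns=["id", "path"],
    where_clauses=["parentId='syn123'"],
    force_requery=True,
)
scoped_table = store.getStorageFileviewTable()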
426 @staticmethod 427 def build_clause_from_dataset_id( 428 dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None 429 ) -> str: 430 """ 431 Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized. 432 Args: 433 dataset_id: Synapse ID of a dataset that should be used to limit the query 434 dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query 435 Returns: 436 clause for the query or an empty string if no dataset ID is provided 437 """ 438 # Calling this method without specifying synIDs will complete but will not scope the view 439 if (not dataset_id) and (not dataset_folder_list): 440 return "" 441 442 # This will be used to gather files under a dataset recursively with a fileview query instead of walking 443 if dataset_folder_list: 444 search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list) 445 return f"parentId IN ({search_folders})" 446 447 # `dataset_id` should be provided when all files are stored directly under the dataset folder 448 return f"parentId='{dataset_id}'"
Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
Arguments:
- dataset_id: Synapse ID of a dataset that should be used to limit the query
- dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:
clause for the query or an empty string if no dataset ID is provided
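The behavior follows directly from the code above; for example, with placeholder Synapse IDs:

from schematic.store.synapse import SynapseStorage

# Scope to files directly under one dataset folder
SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
# -> "parentId='syn123'"

# Scope to a dataset folder plus its subfolders (recursive gathering)
SynapseStorage.build_clause_from_dataset_id(
    dataset_folder_list=["syn123", "syn124"]
)
# -> "parentId IN ('syn123', 'syn124')"

# No IDs given: empty string, i.e. the query is not scoped
SynapseStorage.build_clause_from_dataset_id()
# -> ""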
488 @staticmethod 489 @tracer.start_as_current_span("SynapseStorage::login") 490 def login( 491 synapse_cache_path: Optional[str] = None, 492 access_token: Optional[str] = None, 493 ) -> synapseclient.Synapse: 494 """Login to Synapse 495 496 Args: 497 access_token (Optional[str], optional): A synapse access token. Defaults to None. 498 synapse_cache_path (Optional[str]): location of synapse cache 499 500 Raises: 501 ValueError: If unable to loging with access token 502 503 Returns: 504 synapseclient.Synapse: A Synapse object that is logged in 505 """ 506 if not access_token: 507 access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") 508 509 # login using a token 510 if access_token: 511 try: 512 syn = synapseclient.Synapse( 513 cache_root_dir=synapse_cache_path, 514 debug=False, 515 skip_checks=True, 516 cache_client=False, 517 ) 518 syn.login(authToken=access_token, silent=True) 519 except SynapseHTTPError as exc: 520 raise ValueError( 521 "No access to resources. Please make sure that your token is correct" 522 ) from exc 523 else: 524 # login using synapse credentials provided by user in .synapseConfig (default) file 525 syn = synapseclient.Synapse( 526 configPath=CONFIG.synapse_configuration_path, 527 cache_root_dir=synapse_cache_path, 528 debug=False, 529 skip_checks=True, 530 cache_client=False, 531 ) 532 syn.login(silent=True) 533 534 # set user id attribute 535 current_span = trace.get_current_span() 536 if current_span.is_recording(): 537 current_span.set_attribute("user.id", syn.credentials.owner_id) 538 539 return syn
Login to Synapse
Arguments:
- access_token (Optional[str], optional): A synapse access token. Defaults to None.
- synapse_cache_path (Optional[str]): Location of the Synapse cache
Raises:
- ValueError: If unable to log in with the access token
Returns:
synapseclient.Synapse: A Synapse object that is logged in
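A minimal login sketch; the token value is a placeholder, and the environment-variable fallback matches the code above:

import os
from schematic.store.synapse import SynapseStorage

# An explicit token wins; otherwise SYNAPSE_ACCESS_TOKEN is read, and
# failing that the .synapseConfig credentials are used.
os.environ["SYNAPSE_ACCESS_TOKEN"] = "my-placeholder-token"
syn = SynapseStorage.login(synapse_cache_path="/tmp/synapse_cache")
print(syn.credentials.owner_id)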
541 def missing_entity_handler(method): 542 def wrapper(*args, **kwargs): 543 try: 544 return method(*args, **kwargs) 545 except SynapseHTTPError as ex: 546 str_message = str(ex).replace("\n", "") 547 if "trash" in str_message or "does not exist" in str_message: 548 logging.warning(str_message) 549 return None 550 else: 551 raise ex 552 553 return wrapper
555 def async_missing_entity_handler(method): 556 """Decorator to handle missing entities in async methods.""" 557 558 async def wrapper(*args: Any, **kwargs: Any) -> Any: 559 try: 560 return await method(*args, **kwargs) 561 except SynapseHTTPError as ex: 562 str_message = str(ex).replace("\n", "") 563 if "trash" in str_message or "does not exist" in str_message: 564 logging.warning(str_message) 565 return None 566 else: 567 raise ex 568 569 return wrapper
Decorator to handle missing entities in async methods.
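A hedged sketch of how this decorator would be applied inside the class; the decorated coroutine below is hypothetical, not part of the module:

@async_missing_entity_handler
async def _get_entity_bundle(self, synapse_id: str):
    # Hypothetical coroutine: if the request raises a SynapseHTTPError whose
    # message contains "trash" or "does not exist", the decorator logs a
    # warning and the call resolves to None instead of raising.
    return await get_entity_id_bundle2(entity_id=synapse_id, synapse_client=self.syn)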
571 def getStorageFileviewTable(self): 572 """Returns the storageFileviewTable obtained during initialization.""" 573 return self.storageFileviewTable
Returns the storageFileviewTable obtained during initialization.
575 def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]: 576 """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to. 577 578 Args: 579 currentUserId: synapse id for the user whose projects we want to get. 580 581 Returns: 582 A dictionary with a next page token and the results. 583 """ 584 all_results = self.syn.restGET( 585 "/projects/user/{principalId}".format(principalId=currentUserId) 586 ) 587 588 while ( 589 "nextPageToken" in all_results 590 ): # iterate over next page token in results while there is any 591 results_token = self.syn.restGET( 592 "/projects/user/{principalId}?nextPageToken={nextPageToken}".format( 593 principalId=currentUserId, 594 nextPageToken=all_results["nextPageToken"], 595 ) 596 ) 597 all_results["results"].extend(results_token["results"]) 598 599 if "nextPageToken" in results_token: 600 all_results["nextPageToken"] = results_token["nextPageToken"] 601 else: 602 del all_results["nextPageToken"] 603 604 return all_results
Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
Arguments:
- currentUserId: synapse ID of the user whose projects we want to get.
Returns:
A dictionary with a next page token and the results.
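A short usage sketch, reusing the store instance from the constructor sketch; it assumes the returned project headers expose "id" and "name" keys, consistent with how getStorageProjects consumes them:

user_id = store.syn.credentials.owner_id
page = store.getPaginatedRestResults(user_id)
# "results" holds project headers; pagination has already been exhausted
for project_header in page["results"]:
    print(project_header.get("id"), project_header.get("name"))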
606 @tracer.start_as_current_span("SynapseStorage::getStorageProjects") 607 def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]: 608 """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute. 609 610 Returns: 611 A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName). 612 """ 613 614 # get the set of all storage Synapse project accessible for this pipeline 615 storageProjects = self.storageFileviewTable["projectId"].unique() 616 617 # get the set of storage Synapse project accessible for this user 618 # get a list of projects from Synapse 619 current_user_project_headers = self.synapse_entity_tracker.get_project_headers( 620 current_user_id=self.syn.credentials.owner_id, syn=self.syn 621 ) 622 project_id_to_name_dict = {} 623 current_user_projects = [] 624 for project_header in current_user_project_headers: 625 project_id_to_name_dict[project_header.get("id")] = project_header.get( 626 "name" 627 ) 628 current_user_projects.append(project_header.get("id")) 629 630 # find set of user projects that are also in this pipeline's storage projects set 631 storageProjects = list(set(storageProjects) & set(current_user_projects)) 632 633 # Limit projects to scope if specified 634 if project_scope: 635 storageProjects = list(set(storageProjects) & set(project_scope)) 636 637 if not storageProjects: 638 raise Warning( 639 f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}" 640 ) 641 642 # prepare a return list of project IDs and names 643 projects = [] 644 for projectId in storageProjects: 645 project_name_from_project_header = project_id_to_name_dict.get(projectId) 646 projects.append((projectId, project_name_from_project_header)) 647 648 sorted_projects_list = sorted(projects, key=lambda tup: tup[0]) 649 650 return sorted_projects_list
Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
Returns:
A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
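For example, reusing the store instance from the constructor sketch (IDs are placeholders):

# All accessible storage projects, sorted by Synapse ID
projects = store.getStorageProjects()
# e.g. [("syn111", "Project A"), ("syn222", "Project B")]

# Limit results to an explicit project scope
scoped_projects = store.getStorageProjects(project_scope=["syn111"])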
652 @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject") 653 def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: 654 """Gets all datasets in folder under a given storage project that the current user has access to. 655 656 Args: 657 projectId: synapse ID of a storage project. 658 659 Returns: 660 A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). 661 None: If the projectId cannot be found on Synapse. 662 """ 663 664 # select all folders and fetch their names from within the storage project; 665 # if folder content type is defined, only select folders that contain datasets 666 if "contentType" in self.storageFileviewTable.columns: 667 foldersTable = self.storageFileviewTable[ 668 (self.storageFileviewTable["contentType"] == "dataset") 669 & (self.storageFileviewTable["projectId"] == projectId) 670 ] 671 else: 672 foldersTable = self.storageFileviewTable[ 673 (self.storageFileviewTable["type"] == "folder") 674 & (self.storageFileviewTable["parentId"] == projectId) 675 ] 676 677 # get an array of tuples (folderId, folderName) 678 # some folders are part of datasets; others contain datasets 679 # each dataset parent is the project; folders part of a dataset have another folder as a parent 680 # to get folders if and only if they contain datasets for each folder 681 # check if folder's parent is the project; if so that folder contains a dataset, 682 # unless the folder list has already been filtered to dataset folders based on contentType attribute above 683 684 datasetList = [] 685 folderProperties = ["id", "name"] 686 for folder in list( 687 foldersTable[folderProperties].itertuples(index=False, name=None) 688 ): 689 datasetList.append(folder) 690 691 sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) 692 693 return sorted_dataset_list
Gets all dataset folders under a given storage project that the current user has access to.
Arguments:
- projectId: synapse ID of a storage project.
Returns:
A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). None: If the projectId cannot be found on Synapse.
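For example, with a placeholder project ID and the store instance from the constructor sketch:

datasets = store.getStorageDatasetsInProject("syn111")
for dataset_id, dataset_name in datasets:
    print(dataset_id, dataset_name)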
695 @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") 696 def getFilesInStorageDataset( 697 self, datasetId: str, fileNames: List = None, fullpath: bool = True 698 ) -> List[Tuple[str, str]]: 699 """Gets all files (excluding manifest files) in a given dataset folder. 700 701 Args: 702 datasetId: synapse ID of a storage dataset. 703 fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. 704 metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 705 fullpath: if True return the full path as part of this filename; otherwise return just base filename 706 707 Returns: 708 A list of files; the list consists of tuples (fileId, fileName). 709 710 Raises: 711 ValueError: Dataset ID not found. 712 """ 713 file_list = [] 714 715 # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view 716 if self.storageFileviewTable.empty: 717 raise ValueError( 718 f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." 719 ) 720 child_path = self.storageFileviewTable.loc[ 721 self.storageFileviewTable["parentId"] == datasetId, "path" 722 ] 723 if child_path.empty: 724 raise LookupError( 725 f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 726 ) 727 child_path = child_path.iloc[0] 728 729 # Get the dataset path by eliminating the child's portion of the path to account for nested datasets 730 parent = child_path.split("/")[:-1] 731 parent = "/".join(parent) 732 733 # When querying, only include files to exclude entity files and subdirectories 734 where_clauses = [create_like_statement(parent), "type='file'"] 735 736 # Requery the fileview to specifically get the files in the given dataset 737 self.query_fileview(columns=["id", "path"], where_clauses=where_clauses) 738 739 # Exclude manifest files 740 non_manifest_files = self.storageFileviewTable.loc[ 741 ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"), 742 :, 743 ] 744 745 # Remove all files that are not in the list of fileNames 746 if fileNames: 747 filename_regex = "|".join(fileNames) 748 749 matching_files = non_manifest_files["path"].str.contains( 750 filename_regex, case=False, regex=True 751 ) 752 753 non_manifest_files = non_manifest_files.loc[matching_files, :] 754 755 # Truncate path if necessary 756 if not fullpath: 757 non_manifest_files.path = non_manifest_files.path.apply(os.path.basename) 758 759 # Return list of files as expected by other methods 760 file_list = list(non_manifest_files.itertuples(index=False, name=None)) 761 762 return file_list
Gets all files (excluding manifest files) in a given dataset folder.
Arguments:
- datasetId: synapse ID of a storage dataset.
- fileNames: get a list of files with particular names; defaults to None, in which case all dataset files are returned (except bookkeeping files, e.g. metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
- fullpath: if True, return the full path as part of the filename; otherwise return just the base filename
Returns:
A list of files; the list consists of tuples (fileId, fileName).
Raises:
- ValueError: Dataset ID not found.
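A hedged usage sketch with placeholder IDs and filenames, reusing the store instance from the constructor sketch:

# All non-manifest files in the dataset, as (fileId, path) tuples
files = store.getFilesInStorageDataset("syn123")

# Only files whose names match the given patterns, returned as base filenames
matching = store.getFilesInStorageDataset(
    "syn123", fileNames=["sample1.bam"], fullpath=False
)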
789 @tracer.start_as_current_span("SynapseStorage::getDatasetManifest") 790 def getDatasetManifest( 791 self, 792 datasetId: str, 793 downloadFile: bool = False, 794 newManifestName: str = "", 795 use_temporary_folder: bool = True, 796 ) -> Union[str, File]: 797 """Gets the manifest associated with a given dataset. 798 799 Args: 800 datasetId: synapse ID of a storage dataset. 801 downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not. 802 newManifestName: new name of a manifest that gets downloaded 803 use_temporary_folder: boolean argument indicating if a temporary folder 804 should be used to store the manifest file. This is useful when running 805 this code as an API server where multiple requests could be made at the 806 same time. This is set to False when the code is being used from the 807 CLI. Defaults to True. 808 809 Returns: 810 manifest_syn_id (String): Synapse ID of exisiting manifest file. 811 manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True. 812 "" (String): No pre-exisiting manifest in dataset. 813 """ 814 manifest_data = "" 815 816 # get a list of files containing the manifest for this dataset (if any) 817 all_files = self.storageFileviewTable 818 819 # construct regex based on manifest basename in the config 820 manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") 821 822 # search manifest based on given manifest basename regex above 823 # and return a dataframe containing name and id of manifests in a given asset view 824 manifest = all_files[ 825 (all_files["name"].str.contains(manifest_re, regex=True)) 826 & (all_files["parentId"] == datasetId) 827 ] 828 829 manifest = manifest[["id", "name"]] 830 831 # if there is no pre-exisiting manifest in the specified dataset 832 if manifest.empty: 833 logger.warning( 834 f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" 835 ) 836 return "" 837 838 # if there is an exisiting manifest 839 else: 840 manifest_syn_id = self._get_manifest_id(manifest) 841 if downloadFile: 842 md = ManifestDownload( 843 self.syn, 844 manifest_id=manifest_syn_id, 845 synapse_entity_tracker=self.synapse_entity_tracker, 846 ) 847 manifest_data = md.download_manifest( 848 newManifestName=newManifestName, 849 manifest_df=manifest, 850 use_temporary_folder=use_temporary_folder, 851 ) 852 # TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, 853 # then we should catch the error here without returning an empty string. 854 if not manifest_data: 855 logger.debug( 856 f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" 857 ) 858 return manifest_data 859 return manifest_syn_id
Gets the manifest associated with a given dataset.
Arguments:
- datasetId: synapse ID of a storage dataset.
- downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
- newManifestName: new name of a manifest that gets downloaded
- use_temporary_folder: boolean argument indicating if a temporary folder should be used to store the manifest file. This is useful when running this code as an API server where multiple requests could be made at the same time. This is set to False when the code is being used from the CLI. Defaults to True.
Returns:
manifest_syn_id (String): Synapse ID of existing manifest file. manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True. "" (String): No pre-existing manifest in dataset.
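For example, with a placeholder dataset ID and the store instance from the constructor sketch:

# Look up only the manifest's Synapse ID ("" if no manifest exists)
manifest_syn_id = store.getDatasetManifest("syn123")

# Download the manifest and get back the Synapse File entity
manifest_data = store.getDatasetManifest(
    "syn123", downloadFile=True, newManifestName="renamed_manifest.csv"
)
if manifest_data:
    print(manifest_data.path)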
861 def getDataTypeFromManifest(self, manifestId: str): 862 """Fetch a manifest and return data types of all columns 863 Args: 864 manifestId: synapse ID of a manifest 865 """ 866 # get manifest file path 867 manifest_entity = self.synapse_entity_tracker.get( 868 synapse_id=manifestId, syn=self.syn, download_file=True 869 ) 870 manifest_filepath = manifest_entity.path 871 872 # load manifest dataframe 873 manifest = load_df( 874 manifest_filepath, 875 preserve_raw_input=False, 876 data_model=False, 877 ) 878 879 # convert the dataFrame to use best possible dtypes. 880 manifest_new = manifest.convert_dtypes() 881 882 # get data types of columns 883 result = manifest_new.dtypes.to_frame("dtypes").reset_index() 884 885 # return the result as a dictionary 886 result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() 887 888 return result_dict
Fetch a manifest and return data types of all columns
Arguments:
- manifestId: synapse ID of a manifest
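For example, with a placeholder manifest ID; the output shown is illustrative only:

dtypes = store.getDataTypeFromManifest("syn456")
# e.g. {"Filename": "string", "YearofBirth": "Int64"}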
912 def add_entity_id_and_filename( 913 self, datasetId: str, manifest: pd.DataFrame 914 ) -> pd.DataFrame: 915 """add entityid and filename column to an existing manifest assuming entityId column is not already present 916 917 Args: 918 datasetId (str): dataset syn id 919 manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty 920 921 Returns: 922 pd.DataFrame: returns a pandas dataframe 923 """ 924 # get file names and entity ids of a given dataset 925 dataset_files_dict = self._get_files_metadata_from_dataset( 926 datasetId, only_new_files=False 927 ) 928 929 if dataset_files_dict: 930 # turn manifest dataframe back to a dictionary for operation 931 manifest_dict = manifest.to_dict("list") 932 933 # update Filename column 934 # add entityId column to the end 935 manifest_dict.update(dataset_files_dict) 936 937 # if the component column exists in existing manifest, fill up that column 938 if "Component" in manifest_dict.keys(): 939 manifest_dict["Component"] = manifest_dict["Component"] * max( 940 1, len(manifest_dict["Filename"]) 941 ) 942 943 # turn dictionary back to a dataframe 944 manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index") 945 manifest_df_updated = manifest_df_index.transpose() 946 947 # fill na with empty string 948 manifest_df_updated = manifest_df_updated.fillna("") 949 950 # drop index 951 manifest_df_updated = manifest_df_updated.reset_index(drop=True) 952 953 return manifest_df_updated 954 else: 955 return manifest
Add entityId and Filename columns to an existing manifest, assuming an entityId column is not already present
Arguments:
- datasetId (str): dataset Synapse ID
- manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
Returns:
pd.DataFrame: the updated manifest, with the Filename column populated and an entityId column appended
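A hedged sketch, assuming the store instance from the constructor sketch and a placeholder dataset ID:

import pandas as pd

# A manifest whose Filename column exists but is empty, and which has no
# entityId column yet
manifest = pd.DataFrame({"Filename": [""], "Component": ["Biospecimen"]})

updated = store.add_entity_id_and_filename("syn123", manifest)
# Filename is filled from the dataset's files, entityId is appended at the
# end, and the Component value is repeated for every resulting row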
957 def fill_in_entity_id_filename( 958 self, datasetId: str, manifest: pd.DataFrame 959 ) -> Tuple[List, pd.DataFrame]: 960 """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present. 961 962 Args: 963 datasetId (str): dataset syn id 964 manifest (pd.DataFrame): existing manifest dataframe. 965 966 Returns: 967 Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe 968 """ 969 # get dataset file names and entity id as a list of tuple 970 dataset_files = self.getFilesInStorageDataset(datasetId) 971 972 # update manifest with additional filenames, if any 973 # note that if there is an existing manifest and there are files in the dataset 974 # the columns Filename and entityId are assumed to be present in manifest schema 975 # TODO: use idiomatic panda syntax 976 if not dataset_files: 977 manifest = manifest.fillna("") 978 return dataset_files, manifest 979 980 all_files = self._get_file_entityIds( 981 dataset_files=dataset_files, only_new_files=False, manifest=manifest 982 ) 983 new_files = self._get_file_entityIds( 984 dataset_files=dataset_files, only_new_files=True, manifest=manifest 985 ) 986 987 all_files = pd.DataFrame(all_files) 988 new_files = pd.DataFrame(new_files) 989 990 # update manifest so that it contains new dataset files 991 manifest = ( 992 pd.concat([manifest, new_files], sort=False) 993 .reset_index() 994 .drop("index", axis=1) 995 ) 996 997 # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata 998 manifest_reindex = manifest.set_index("entityId") 999 all_files_reindex = all_files.set_index("entityId") 1000 all_files_reindex_like_manifest = all_files_reindex.reindex_like( 1001 manifest_reindex 1002 ) 1003 1004 # Check if individual file paths in manifest and from synapse match 1005 file_paths_match = ( 1006 manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"] 1007 ) 1008 1009 # If all the paths do not match, update the manifest with the filepaths from synapse 1010 if not file_paths_match.all(): 1011 manifest_reindex.loc[ 1012 ~file_paths_match, "Filename" 1013 ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"] 1014 1015 # reformat manifest for further use 1016 manifest = manifest_reindex.reset_index() 1017 entityIdCol = manifest.pop("entityId") 1018 manifest.insert(len(manifest.columns), "entityId", entityIdCol) 1019 1020 manifest = manifest.fillna("") 1021 return dataset_files, manifest
Fill in the Filename and entityId columns. Both columns will be created if not already present.
Arguments:
- datasetId (str): dataset Synapse ID
- manifest (pd.DataFrame): existing manifest dataframe.
Returns:
Tuple[List, pd.DataFrame]: a list of (synId, path) tuples for files under the given datasetId folder, and the updated manifest dataframe
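A short sketch, continuing with the same placeholder objects as above:

dataset_files, updated_manifest = store.fill_in_entity_id_filename(
    "syn123", manifest
)
# dataset_files: [(fileId, path), ...] for the dataset folder
# updated_manifest: rows appended for new files, Filename values synced to
# the paths on Synapse, and entityId moved to the last column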
1023 @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles") 1024 def updateDatasetManifestFiles( 1025 self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True 1026 ) -> Union[Tuple[str, pd.DataFrame], None]: 1027 """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any. 1028 1029 Args: 1030 dmge: DataModelGraphExplorer Instance 1031 datasetId: synapse ID of a storage dataset. 1032 store: if set to True store updated manifest in asset store; if set to False 1033 return a Pandas dataframe containing updated manifest but do not store to asset store 1034 1035 1036 Returns: 1037 Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. 1038 If there is no existing manifest or if the manifest does not have an entityId column, return None 1039 """ 1040 1041 # get existing manifest Synapse ID 1042 manifest_id = self.getDatasetManifest(datasetId) 1043 1044 # if there is no manifest return None 1045 if not manifest_id: 1046 return None 1047 1048 manifest_entity = self.synapse_entity_tracker.get( 1049 synapse_id=manifest_id, syn=self.syn, download_file=True 1050 ) 1051 manifest_filepath = manifest_entity.path 1052 manifest = load_df(manifest_filepath) 1053 1054 # If the manifest does not have an entityId column, trigger a new manifest to be generated 1055 if "entityId" not in manifest.columns: 1056 return None 1057 1058 manifest_is_file_based = "Filename" in manifest.columns 1059 1060 if manifest_is_file_based: 1061 # update manifest with additional filenames, if any 1062 # note that if there is an existing manifest and there are files in the dataset 1063 # the columns Filename and entityId are assumed to be present in manifest schema 1064 # TODO: use idiomatic panda syntax 1065 dataset_files, manifest = self.fill_in_entity_id_filename( 1066 datasetId, manifest 1067 ) 1068 if dataset_files: 1069 # update the manifest file, so that it contains the relevant entity IDs 1070 if store: 1071 manifest.to_csv(manifest_filepath, index=False) 1072 1073 # store manifest and update associated metadata with manifest on Synapse 1074 manifest_id = self.associateMetadataWithFiles( 1075 dmge, manifest_filepath, datasetId 1076 ) 1077 1078 return manifest_id, manifest
Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
Arguments:
- dmge: DataModelGraphExplorer Instance
- datasetId: synapse ID of a storage dataset.
- store: if set to True, store the updated manifest in the asset store; if set to False, return a pandas DataFrame containing the updated manifest but do not store it to the asset store
Returns:
Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. If there is no existing manifest or if the manifest does not have an entityId column, return None
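A hedged usage sketch; dmge is a DataModelGraphExplorer built elsewhere, the dataset ID is a placeholder, and store is the instance from the constructor sketch:

result = store.updateDatasetManifestFiles(dmge, "syn123", store=False)
if result is not None:
    manifest_id, updated_manifest = result
    # updated_manifest now includes any files added to the dataset since
    # the manifest was last stored; nothing was written back to Synapse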
1124 @tracer.start_as_current_span("SynapseStorage::getProjectManifests") 1125 def getProjectManifests( 1126 self, projectId: str 1127 ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: 1128 """Gets all metadata manifest files across all datasets in a specified project. 1129 1130 Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest 1131 as a list of tuples, one for each manifest: 1132 [ 1133 ( 1134 (datasetId, dataName), 1135 (manifestId, manifestName), 1136 (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema 1137 ), 1138 ... 1139 ] 1140 1141 TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface 1142 """ 1143 component = None 1144 entity = None 1145 manifests = [] 1146 1147 datasets = self.getStorageDatasetsInProject(projectId) 1148 1149 for datasetId, datasetName in datasets: 1150 # encode information about the manifest in a simple list (so that R clients can unpack it) 1151 # eventually can serialize differently 1152 1153 # Get synID of manifest for a dataset 1154 manifestId = self.getDatasetManifest(datasetId) 1155 1156 # If a manifest exists, get the annotations for it, else return base 'manifest' tuple 1157 if manifestId: 1158 annotations = self.getFileAnnotations(manifestId) 1159 1160 # If manifest has annotations specifying component, use that 1161 if annotations and "Component" in annotations: 1162 component = annotations["Component"] 1163 entity = self.synapse_entity_tracker.get( 1164 synapse_id=manifestId, syn=self.syn, download_file=False 1165 ) 1166 manifest_name = entity["properties"]["name"] 1167 1168 # otherwise download the manifest and parse for information 1169 elif not annotations or "Component" not in annotations: 1170 logging.debug( 1171 f"No component annotations have been found for manifest {manifestId}. " 1172 "The manifest will be downloaded and parsed instead. " 1173 "For increased speed, add component annotations to manifest." 1174 ) 1175 1176 manifest_info = self.getDatasetManifest( 1177 datasetId, downloadFile=True 1178 ) 1179 manifest_name = manifest_info["properties"].get("name", "") 1180 1181 if not manifest_name: 1182 logger.error(f"Failed to download manifests from {datasetId}") 1183 1184 manifest_path = manifest_info["path"] 1185 1186 manifest_df = load_df(manifest_path) 1187 1188 # Get component from component column if it exists 1189 if ( 1190 "Component" in manifest_df 1191 and not manifest_df["Component"].empty 1192 ): 1193 list(set(manifest_df["Component"])) 1194 component = list(set(manifest_df["Component"])) 1195 1196 # Added to address issues raised during DCA testing 1197 if "" in component: 1198 component.remove("") 1199 1200 if len(component) == 1: 1201 component = component[0] 1202 elif len(component) > 1: 1203 logging.warning( 1204 f"Manifest {manifestId} is composed of multiple components. Schematic does not support mulit-component manifests at this time." 
1205 "Behavior of manifests with multiple components is undefined" 1206 ) 1207 else: 1208 manifest_name = "" 1209 component = None 1210 if component: 1211 manifest = ( 1212 (datasetId, datasetName), 1213 (manifestId, manifest_name), 1214 (component, component), 1215 ) 1216 elif manifestId: 1217 logging.debug( 1218 f"Manifest {manifestId} does not have an associated Component" 1219 ) 1220 manifest = ( 1221 (datasetId, datasetName), 1222 (manifestId, manifest_name), 1223 ("", ""), 1224 ) 1225 else: 1226 manifest = ( 1227 (datasetId, datasetName), 1228 ("", ""), 1229 ("", ""), 1230 ) 1231 1232 if manifest: 1233 manifests.append(manifest) 1234 1235 return manifests
Gets all metadata manifest files across all datasets in a specified project.
Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest, as a list of tuples, one per manifest:
[
  (
    (datasetId, dataName),
    (manifestId, manifestName),
    (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
  ),
  ...
]
TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
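A minimal usage sketch, hedged: it assumes a `SynapseStorage` instance can be constructed with default configuration and authenticate on its own, and the project ID shown is a placeholder. The returned tuples unpack as follows:

```python
from schematic.store.synapse import SynapseStorage

# `store` is assumed to be an authenticated SynapseStorage instance;
# "syn12345678" is a hypothetical project ID.
store = SynapseStorage()

for (dataset_id, dataset_name), (manifest_id, manifest_name), (component, _) in store.getProjectManifests(
    projectId="syn12345678"
):
    # Empty strings mark datasets without a manifest or without a Component.
    print(f"{dataset_name} ({dataset_id}): manifest={manifest_id or 'none'} component={component or 'n/a'}")
```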
1237 def upload_project_manifests_to_synapse( 1238 self, dmge: DataModelGraphExplorer, projectId: str 1239 ) -> List[str]: 1240 """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. 1241 1242 Returns: List of the names of all the datasets whose manifests were uploaded. 1243 """ 1244 1245 manifests = [] 1246 manifest_loaded = [] 1247 datasets = self.getStorageDatasetsInProject(projectId) 1248 1249 for datasetId, datasetName in datasets: 1250 # encode information about the manifest in a simple list (so that R clients can unpack it) 1251 # eventually can serialize differently 1252 1253 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1254 1255 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1256 if manifest_info: 1257 manifest_id = manifest_info["properties"]["id"] 1258 manifest_name = manifest_info["properties"]["name"] 1259 manifest_path = manifest_info["path"] 1260 manifest_df = load_df(manifest_path) 1261 manifest_table_id = self.uploadDB( 1262 dmge=dmge, 1263 manifest=manifest_df, 1264 datasetId=datasetId, 1265 table_name=datasetName, 1266 ) 1267 manifest_loaded.append(datasetName) 1268 return manifest_loaded
Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
Returns: List of the names of all the datasets whose manifests were uploaded.
1270 def upload_annotated_project_manifests_to_synapse( 1271 self, projectId: str, path_to_json_ld: str, dry_run: bool = False 1272 ) -> Tuple[List, List]: 1273 """ 1274 Purpose: 1275 For all manifests in a project, upload them as a table and add annotations from the manifest CSV. 1276 Assumes the manifest is already present as a CSV in a dataset in the project. 1277 Returns a tuple of (manifests, manifest_loaded). 1278 """ from schematic.schemas.data_model_parser import DataModelParser from schematic.schemas.data_model_graph import DataModelGraph 1279 # Instantiate DataModelParser 1280 data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld) 1281 # Parse Model 1282 parsed_data_model = data_model_parser.parse_model() 1283 1284 # Instantiate DataModelGraph 1285 data_model_grapher = DataModelGraph(parsed_data_model) 1286 1287 # Generate graph 1288 graph_data_model = data_model_grapher.generate_data_model_graph() 1289 1290 # Instantiate DataModelGraphExplorer 1291 dmge = DataModelGraphExplorer(graph_data_model) 1292 1293 manifests = [] 1294 manifest_loaded = [] 1295 datasets = self.getStorageDatasetsInProject(projectId) 1296 for datasetId, datasetName in datasets: 1297 # encode information about the manifest in a simple list (so that R clients can unpack it) 1298 # eventually can serialize differently 1299 1300 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1301 manifests.append(manifest) 1302 1303 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1304 1305 if manifest_info: 1306 manifest_id = manifest_info["properties"]["id"] 1307 manifest_name = manifest_info["properties"]["name"] 1308 manifest_path = manifest_info["path"] 1309 manifest = ( 1310 (datasetId, datasetName), 1311 (manifest_id, manifest_name), 1312 ("", ""), 1313 ) 1314 if not dry_run: 1315 self.associateMetadataWithFiles( 1316 dmge, manifest_path, datasetId, manifest_record_type="table_and_file" 1317 ) 1318 manifest_loaded.append(manifest) 1319 1320 return manifests, manifest_loaded
Purpose:
For all manifests in a project, upload them as a table and add annotations from the manifest CSV. Assumes the manifest is already present as a CSV in a dataset in the project. Returns a tuple of (manifests, manifest_loaded).
1322 def move_entities_to_new_project( 1323 self, 1324 projectId: str, 1325 newProjectId: str, 1326 returnEntities: bool = False, 1327 dry_run: bool = False, 1328 ): 1329 """ 1330 For each manifest csv in a project, look for all the entity ids that are associated. 1331 Look up the entity in the files, move the entity to the new project. 1332 """ 1333 1334 manifests = [] 1335 manifest_loaded = [] 1336 datasets = self.getStorageDatasetsInProject(projectId) 1337 if datasets: 1338 for datasetId, datasetName in datasets: 1339 # encode information about the manifest in a simple list (so that R clients can unpack it) 1340 # eventually can serialize differently 1341 1342 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1343 manifests.append(manifest) 1344 1345 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1346 if manifest_info: 1347 manifest_id = manifest_info["properties"]["id"] 1348 manifest_name = manifest_info["properties"]["name"] 1349 manifest_path = manifest_info["path"] 1350 manifest_df = load_df(manifest_path) 1351 1352 manifest = ( 1353 (datasetId, datasetName), 1354 (manifest_id, manifest_name), 1355 ("", ""), 1356 ) 1357 manifest_loaded.append(manifest) 1358 1359 annotation_entities = self.storageFileviewTable[ 1360 (self.storageFileviewTable["id"].isin(manifest_df["entityId"])) 1361 & (self.storageFileviewTable["type"] == "folder") 1362 ]["id"] 1363 1364 if returnEntities: 1365 for entityId in annotation_entities: 1366 if not dry_run: 1367 moved_entity = self.syn.move(entityId, datasetId) 1368 self.synapse_entity_tracker.add( 1369 synapse_id=moved_entity.id, entity=moved_entity 1370 ) 1371 else: 1372 logger.info( 1373 f"{entityId} will be moved to folder {datasetId}." 1374 ) 1375 else: 1376 # generate project folder 1377 archive_project_folder = Folder( 1378 projectId + "_archive", parent=newProjectId 1379 ) 1380 archive_project_folder = self.syn.store(archive_project_folder) 1381 self.synapse_entity_tracker.add( 1382 synapse_id=archive_project_folder.id, 1383 entity=archive_project_folder, 1384 ) 1385 1386 # generate dataset folder 1387 dataset_archive_folder = Folder( 1388 "_".join([datasetId, datasetName, "archive"]), 1389 parent=archive_project_folder.id, 1390 ) 1391 dataset_archive_folder = self.syn.store(dataset_archive_folder) 1392 self.synapse_entity_tracker.add( 1393 synapse_id=dataset_archive_folder.id, 1394 entity=dataset_archive_folder, 1395 ) 1396 1397 for entityId in annotation_entities: 1398 # move entities to folder 1399 if not dry_run: 1400 moved_entity = self.syn.move( 1401 entityId, dataset_archive_folder.id 1402 ) 1403 self.synapse_entity_tracker.add( 1404 synapse_id=moved_entity.id, entity=moved_entity 1405 ) 1406 else: 1407 logger.info( 1408 f"{entityId} will be moved to folder {dataset_archive_folder.id}." 1409 ) 1410 else: 1411 raise LookupError( 1412 f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." 1413 ) 1414 return manifests, manifest_loaded
For each manifest csv in a project, look for all the entity ids that are associated. Look up the entity in the files, move the entity to the new project.
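A hedged sketch of a dry run, which previews an archive move without modifying anything on Synapse; both project IDs are placeholders and `store` is assumed to be an authenticated `SynapseStorage` instance as in the earlier sketch:

```python
# dry_run=True only logs which entities would be moved; nothing is stored.
manifests, manifest_loaded = store.move_entities_to_new_project(
    projectId="syn11111111",     # placeholder source project
    newProjectId="syn22222222",  # placeholder archive project
    returnEntities=False,
    dry_run=True,
)
```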
1416 @tracer.start_as_current_span("SynapseStorage::get_synapse_table") 1417 def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: 1418 """Download a Synapse table as a pandas DataFrame; also return the raw query results, which carry the table schema and etags. 1419 1420 Args: 1421 synapse_id: synapse ID of the table to query 1422 """ 1423 1424 results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) 1425 df = results.asDataFrame( 1426 rowIdAndVersionInIndex=False, 1427 na_values=STR_NA_VALUES_FILTERED, 1428 keep_default_na=False, 1429 ) 1430 1431 return df, results
Download a Synapse table as a pandas DataFrame; also return the raw query results, which carry the table schema and etags.
Arguments:
- synapse_id: synapse ID of the table to query
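A hedged usage sketch: the table ID is a placeholder, and `store` is assumed to be an authenticated `SynapseStorage` instance. The second return value is the `CsvFileTable` produced by the query:

```python
# Pull a Synapse table into pandas ("syn98765432" is a placeholder table ID).
df, results = store.get_synapse_table("syn98765432")

print(df.shape)         # rows x columns of the downloaded table
print(results.tableId)  # the queried table's Synapse ID, from the raw results
```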
542 def wrapper(*args, **kwargs): 543 try: 544 return method(*args, **kwargs) 545 except SynapseHTTPError as ex: 546 str_message = str(ex).replace("\n", "") 547 if "trash" in str_message or "does not exist" in str_message: 548 logging.warning(str_message) 549 return None 550 else: 551 raise ex
Method to upload a database to an asset store. In Synapse, this will upload a metadata table.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest: pd.Df manifest to upload
- datasetId: synID of the dataset for the manifest
- table_name: name of the table to be uploaded
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
- existingTableId: str of the synId of the existing table, if one already exists
- table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
- table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:
- manifest_table_id: synID of the uploaded table
- manifest: the original manifest
- table_manifest: manifest formatted appropriately for the table
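A hedged sketch of a direct call: `dmge` and `manifest_df` are assumed to be an existing DataModelGraphExplorer and a validated manifest DataFrame, `store` is an authenticated `SynapseStorage` instance, and the dataset synID is a placeholder:

```python
# Upload a validated manifest DataFrame as a Synapse table, replacing any
# existing table with the same name.
manifest_table_id, manifest_df, table_manifest = store.uploadDB(
    dmge=dmge,
    manifest=manifest_df,
    datasetId="syn11111111",  # placeholder dataset synID
    table_name="example_component_table",
    restrict=False,
    table_manipulation="replace",
    table_column_names="class_label",
)
```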
1482 @tracer.start_as_current_span("SynapseStorage::formatDB") 1483 def formatDB(self, dmge, manifest, table_column_names): 1484 """ 1485 Method to format a manifest appropriately for upload as a table 1486 1487 Args: 1488 dmge: DataModelGraphExplorer object 1489 manifest: pd.Df manifest to upload 1490 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 1491 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 1492 display label formatting. 1493 Returns: 1494 col_schema: schema for table columns: type, size, etc 1495 table_manifest: formatted manifest 1496 1497 """ 1498 # Rename the manifest columns to display names to match fileview 1499 1500 blacklist_chars = ["(", ")", ".", " ", "-"] 1501 manifest_columns = manifest.columns.tolist() 1502 1503 table_manifest = deepcopy(manifest) 1504 1505 if table_column_names == "display_name": 1506 cols = table_manifest.columns 1507 1508 elif table_column_names == "display_label": 1509 cols = [ 1510 str(col).translate({ord(x): "" for x in blacklist_chars}) 1511 for col in manifest_columns 1512 ] 1513 1514 elif table_column_names == "class_label": 1515 cols = [ 1516 get_class_label_from_display_name(str(col)).translate( 1517 {ord(x): "" for x in blacklist_chars} 1518 ) 1519 for col in manifest_columns 1520 ] 1521 else: 1522 raise ValueError( 1523 f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only." 1524 ) 1525 1526 cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols)) 1527 1528 # Reset column names in table manifest 1529 table_manifest.columns = cols 1530 1531 # move entity id to end of df 1532 entity_col = table_manifest.pop("entityId") 1533 table_manifest.insert(len(table_manifest.columns), "entityId", entity_col) 1534 1535 # Get the column schema 1536 col_schema = as_table_columns(table_manifest) 1537 1538 # Set Id column length to 64 (for some reason not being auto set.) 1539 for i, col in enumerate(col_schema): 1540 if col["name"].lower() == "id": 1541 col_schema[i]["maximumSize"] = 64 1542 1543 return col_schema, table_manifest
Method to format a manifest appropriately for upload as a table
Arguments:
- dmge: DataModelGraphExplorer object
- manifest: pd.Df manifest to upload
- table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:
- col_schema: schema for table columns: type, size, etc.
- table_manifest: formatted manifest
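A small, self-contained illustration of the blacklist stripping used for "display_label" column names: `str.translate` with an `{ord(char): ""}` mapping deletes each blacklisted character. The example strings are hypothetical:

```python
# Build a translation table that deletes every blacklisted character.
blacklist_chars = ["(", ")", ".", " ", "-"]
strip_table = {ord(x): "" for x in blacklist_chars}

print("Family History (Cancer)".translate(strip_table))  # FamilyHistoryCancer
print("Sample-ID.v2".translate(strip_table))             # SampleIDv2
```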
1545 @tracer.start_as_current_span("SynapseStorage::buildDB") 1546 def buildDB( 1547 self, 1548 datasetId: str, 1549 table_name: str, 1550 col_schema: List, 1551 table_manifest: pd.DataFrame, 1552 table_manipulation: str, 1553 dmge: DataModelGraphExplorer, 1554 restrict: bool = False, 1555 ): 1556 """ 1557 Method to construct the table appropriately: create new table, replace existing, or upsert new into existing 1558 Calls TableOperations class to execute 1559 1560 Args: 1561 datasetId: synID of the dataset for the manifest 1562 table_name: name of the table to be uploaded 1563 col_schema: schema for table columns: type, size, etc from `formatDB` 1564 table_manifest: formatted manifest that can be uploaded as a table 1565 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1566 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1567 1568 Returns: 1569 manifest_table_id: synID of the uploaded table 1570 1571 """ 1572 table_parent_id = self.getDatasetProject(datasetId=datasetId) 1573 existing_table_id = self.syn.findEntityId( 1574 name=table_name, parent=table_parent_id 1575 ) 1576 tableOps = TableOperations( 1577 synStore=self, 1578 tableToLoad=table_manifest, 1579 tableName=table_name, 1580 datasetId=datasetId, 1581 existingTableId=existing_table_id, 1582 restrict=restrict, 1583 synapse_entity_tracker=self.synapse_entity_tracker, 1584 ) 1585 1586 if not table_manipulation or existing_table_id is None: 1587 manifest_table_id = tableOps.createTable( 1588 columnTypeDict=col_schema, 1589 specifySchema=True, 1590 ) 1591 elif existing_table_id is not None: 1592 if table_manipulation.lower() == "replace": 1593 manifest_table_id = tableOps.replaceTable( 1594 specifySchema=True, 1595 columnTypeDict=col_schema, 1596 ) 1597 elif table_manipulation.lower() == "upsert": 1598 manifest_table_id = tableOps.upsertTable( 1599 dmge=dmge, 1600 ) 1601 elif table_manipulation.lower() == "update": 1602 manifest_table_id = tableOps.updateTable() 1603 1604 if table_manipulation and table_manipulation.lower() == "upsert": 1605 table_entity = self.synapse_entity_tracker.get( 1606 synapse_id=existing_table_id or manifest_table_id, 1607 syn=self.syn, 1608 download_file=False, 1609 ) 1610 annos = OldAnnotations( 1611 id=table_entity.id, 1612 etag=table_entity.etag, 1613 values=table_entity.annotations, 1614 ) 1615 annos["primary_key"] = table_manifest["Component"][0] + "_id" 1616 annos = self.syn.set_annotations(annos) 1617 table_entity.etag = annos.etag 1618 table_entity.annotations = annos 1619 1620 return manifest_table_id
Method to construct the table appropriately: create new table, replace existing, or upsert new into existing Calls TableOperations class to execute
Arguments:
- datasetId: synID of the dataset for the manifest
- table_name: name of the table to be uploaded
- col_schema: schema for table columns: type, size, etc. from `formatDB`
- table_manifest: formatted manifest that can be uploaded as a table
- table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
Returns:
manifest_table_id: synID of the uploaded table
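A hedged sketch chaining `formatDB` and `buildDB`, mirroring what `uploadDB` presumably does internally; `store`, `dmge`, and `manifest_df` are assumed from the earlier sketches, and the IDs and names are placeholders:

```python
# Format the manifest and derive the column schema, then build the table.
col_schema, table_manifest = store.formatDB(
    dmge=dmge, manifest=manifest_df, table_column_names="class_label"
)
manifest_table_id = store.buildDB(
    datasetId="syn11111111",  # placeholder dataset synID
    table_name="example_component_table",
    col_schema=col_schema,
    table_manifest=table_manifest,
    table_manipulation="upsert",  # with "upsert", a "primary_key" annotation is also set on the table
    dmge=dmge,
    restrict=False,
)
```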
1622 @tracer.start_as_current_span("SynapseStorage::upload_manifest_file") 1623 def upload_manifest_file( 1624 self, 1625 manifest, 1626 metadataManifestPath, 1627 datasetId, 1628 restrict_manifest, 1629 component_name="", 1630 ): 1631 # Update manifest to have the new entityId column 1632 manifest.to_csv(metadataManifestPath, index=False) 1633 1634 # store manifest to Synapse as a CSV 1635 # update file name 1636 file_name_full = metadataManifestPath.split("/")[-1] 1637 file_extension = file_name_full.split(".")[-1] 1638 1639 # Differentiate "censored" and "uncensored" manifest 1640 if "censored" in file_name_full: 1641 file_name_new = ( 1642 os.path.basename(CONFIG.synapse_manifest_basename) 1643 + "_" 1644 + component_name 1645 + "_censored" 1646 + "." 1647 + file_extension 1648 ) 1649 else: 1650 file_name_new = ( 1651 os.path.basename(CONFIG.synapse_manifest_basename) 1652 + "_" 1653 + component_name 1654 + "." 1655 + file_extension 1656 ) 1657 1658 manifest_synapse_file = None 1659 try: 1660 # Rename the file to file_name_new then revert 1661 # This is to maintain the original file name in-case other code is 1662 # expecting that the file exists with the original name 1663 original_file_path = metadataManifestPath 1664 new_file_path = os.path.join( 1665 os.path.dirname(metadataManifestPath), file_name_new 1666 ) 1667 os.rename(original_file_path, new_file_path) 1668 1669 manifest_synapse_file = self._store_file_for_manifest_upload( 1670 new_file_path=new_file_path, 1671 dataset_id=datasetId, 1672 existing_file_name=file_name_full, 1673 file_name_new=file_name_new, 1674 restrict_manifest=restrict_manifest, 1675 ) 1676 manifest_synapse_file_id = manifest_synapse_file.id 1677 1678 finally: 1679 # Revert the file name back to the original 1680 os.rename(new_file_path, original_file_path) 1681 1682 if manifest_synapse_file: 1683 manifest_synapse_file.path = original_file_path 1684 1685 return manifest_synapse_file_id
1742 async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]: 1743 """get annotations asynchronously 1744 1745 Args: 1746 synapse_id (str): synapse id of the entity that the annotation belongs to 1747 1748 Returns: 1749 Dict[str, Any]: The requested entity bundle matching 1750 <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html> 1751 """ 1752 return await get_entity_id_bundle2( 1753 entity_id=synapse_id, 1754 request={"includeAnnotations": True}, 1755 synapse_client=self.syn, 1756 )
get annotations asynchronously
Arguments:
- synapse_id (str): synapse id of the entity that the annotation belongs to
Returns:
Dict[str, Any]: The requested entity bundle matching https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html
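Because this is a coroutine, several bundles can be fetched concurrently. A hedged sketch with placeholder synIDs, assuming `store` from the earlier sketches:

```python
import asyncio

async def fetch_bundles(storage, synapse_ids):
    # gather() runs the annotation requests concurrently on the event loop.
    return await asyncio.gather(
        *(storage.get_async_annotation(synapse_id) for synapse_id in synapse_ids)
    )

bundles = asyncio.run(fetch_bundles(store, ["syn11111111", "syn22222222"]))
```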
1758 async def store_async_annotation(self, annotation_dict: dict) -> Annotations: 1759 """Store annotations asynchronously 1760 1761 Args: 1762 annotation_dict (dict): annotation in a dictionary format 1763 1764 Returns: 1765 Annotations: The stored annotations. 1766 """ 1767 annotation_data = Annotations.from_dict( 1768 synapse_annotations=annotation_dict["annotations"]["annotations"] 1769 ) 1770 annotation_class = Annotations( 1771 annotations=annotation_data, 1772 etag=annotation_dict["annotations"]["etag"], 1773 id=annotation_dict["annotations"]["id"], 1774 ) 1775 annotation_storage_result = await annotation_class.store_async( 1776 synapse_client=self.syn 1777 ) 1778 local_entity = self.synapse_entity_tracker.get( 1779 synapse_id=annotation_dict["annotations"]["id"], 1780 syn=self.syn, 1781 download_file=False, 1782 retrieve_if_not_present=False, 1783 ) 1784 if local_entity: 1785 local_entity.etag = annotation_storage_result.etag 1786 local_entity.annotations = annotation_storage_result 1787 return annotation_storage_result
Store annotations asynchronously
Arguments:
- annotation_dict (dict): annotation in a dictionary format
Returns:
Annotations: The stored annotations.
1789 def process_row_annotations( 1790 self, 1791 dmge: DataModelGraphExplorer, 1792 metadata_syn: Dict[str, Any], 1793 hide_blanks: bool, 1794 csv_list_regex: str, 1795 annos: Dict[str, Any], 1796 annotation_keys: str, 1797 ) -> Dict[str, Any]: 1798 """Processes metadata annotations based on the logic below: 1799 1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is: 1800 An empty or whitespace-only string. 1801 A NaN value (if the annotation is a float). 1802 If any of the above conditions are met, and hide_blanks is True, the annotation key is not uploaded and further processing of that annotation key is skipped. 1803 If any of the above conditions are met, and hide_blanks is False, an empty string "" is assigned as the annotation value for that key. 1804 1805 2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name". 1806 Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key. 1807 1808 3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k). 1809 1810 4. Returns the updated annotations dictionary. 1811 1812 Args: 1813 dmge (DataModelGraphExplorer): data model graph explorer 1814 metadata_syn (dict): metadata used for Synapse storage 1815 hide_blanks (bool): if true, does not upload annotation keys with blank values. 1816 csv_list_regex (str): Regex to match with comma separated list 1817 annos (Dict[str, Any]): dictionary of annotation returned from synapse 1818 annotation_keys (str): display_label/class_label 1819 1820 Returns: 1821 Dict[str, Any]: annotations as a dictionary 1822 1823 ```mermaid 1824 flowchart TD 1825 A[Start] --> C{Is anno_v empty, whitespace, or NaN?} 1826 C -- Yes --> D{Is hide_blanks True?} 1827 D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded.
Skip further processing] 1828 D -- No --> F[Assign empty string to annotation key] 1829 C -- No --> G{Is anno_v a string?} 1830 G -- No --> H[Assign original value of anno_v to annotation key] 1831 G -- Yes --> I{Does anno_v match csv_list_regex?} 1832 I -- Yes --> J[Get validation rule of anno_k] 1833 J --> K{Does the validation rule contain 'list'} 1834 K -- Yes --> L[Split anno_v by commas and assign as list] 1835 I -- No --> H 1836 K -- No --> H 1837 ``` 1838 """ 1839 for anno_k, anno_v in metadata_syn.items(): 1840 # Remove keys with nan or empty string values or string that only contains white space from dict of annotations to be uploaded 1841 # if present on current data annotation 1842 if hide_blanks and ( 1843 (isinstance(anno_v, str) and anno_v.strip() == "") 1844 or (isinstance(anno_v, float) and np.isnan(anno_v)) 1845 ): 1846 annos["annotations"]["annotations"].pop(anno_k, None) 1847 1848 1849 continue 1850 1851 # Otherwise save annotation as appropriate 1852 if isinstance(anno_v, float) and np.isnan(anno_v): 1853 annos["annotations"]["annotations"][anno_k] = "" 1854 continue 1855 1856 # Handle strings that match the csv_list_regex and pass the validation rule 1857 if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v): 1858 # Use a dictionary to dynamically choose the argument 1859 param = ( 1860 {"node_display_name": anno_k} 1861 if annotation_keys == "display_label" 1862 else {"node_label": anno_k} 1863 ) 1864 node_validation_rules = dmge.get_node_validation_rules(**param) 1865 1866 if rule_in_rule_list("list", node_validation_rules): 1867 annos["annotations"]["annotations"][anno_k] = anno_v.split(",") 1868 continue 1869 # default: assign the original value 1870 annos["annotations"]["annotations"][anno_k] = anno_v 1871 1872 return annos
Processes metadata annotations based on the logic below:
- Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is an empty or whitespace-only string, or a NaN value (if the annotation is a float). If any of these conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped. If any are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name". Check if the rule contains "list" as a rule; if it does, split the string by comma and assign the resulting list as the annotation value for that key.
For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
Returns the updated annotations dictionary.
Arguments:
- dmge (DataModelGraphExplorer): data model graph explorer
- metadata_syn (dict): metadata used for Synapse storage
- hideBlanks (bool): if true, does not upload annotation keys with blank values.
- csv_list_regex (str): Regex to match with comma separated list
- annos (Dict[str, Any]): dictionary of annotation returned from synapse
- annotation_keys (str): display_label/class_label
Returns:
Dict[str, Any]: annotations as a dictionary
```mermaid
flowchart TD
    A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
    C -- Yes --> D{Is hide_blanks True?}
    D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
    D -- No --> F[Assign empty string to annotation key]
    C -- No --> G{Is anno_v a string?}
    G -- No --> H[Assign original value of anno_v to annotation key]
    G -- Yes --> I{Does anno_v match csv_list_regex?}
    I -- Yes --> J[Get validation rule of anno_k]
    J --> K{Does the validation rule contain 'list'}
    K -- Yes --> L[Split anno_v by commas and assign as list]
    I -- No --> H
    K -- No --> H
```
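An illustration of the list-splitting branch, using the same helper the module imports. It assumes `comma_separated_list_regex()` takes no arguments and returns a pattern usable with `re.fullmatch`; the annotation value is hypothetical:

```python
import re

from schematic.utils.validate_utils import comma_separated_list_regex

csv_list_regex = comma_separated_list_regex()
anno_v = "lung,breast,colon"  # hypothetical comma-separated annotation value

# Mirrors the branch above: only values matching the regex are split into lists.
if re.fullmatch(csv_list_regex, anno_v):
    print(anno_v.split(","))  # ['lung', 'breast', 'colon']
```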
558 async def wrapper(*args: Any, **kwargs: Any) -> Any: 559 try: 560 return await method(*args, **kwargs) 561 except SynapseHTTPError as ex: 562 str_message = str(ex).replace("\n", "") 563 if "trash" in str_message or "does not exist" in str_message: 564 logging.warning(str_message) 565 return None 566 else: 567 raise ex
542 def wrapper(*args, **kwargs): 543 try: 544 return method(*args, **kwargs) 545 except SynapseHTTPError as ex: 546 str_message = str(ex).replace("\n", "") 547 if "trash" in str_message or "does not exist" in str_message: 548 logging.warning(str_message) 549 return None 550 else: 551 raise ex
Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. For now, only the Component annotation is captured.
2258 @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files") 2259 async def add_annotations_to_entities_files( 2260 self, 2261 dmge, 2262 manifest, 2263 manifest_record_type: str, 2264 datasetId: str, 2265 hideBlanks: bool, 2266 manifest_synapse_table_id="", 2267 annotation_keys: str = "class_label", 2268 ): 2269 """ 2270 Depending on upload type, add IDs to the entityId column. Add annotations to connected 2271 files and folders. Despite the name of this function, it also applies to folders. 2272 2273 Args: 2274 dmge: DataModelGraphExplorer Object 2275 manifest (pd.DataFrame): loaded df containing user supplied data. 2276 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2277 datasetId (str): synapse ID of folder containing the dataset 2278 hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false. 2279 manifest_synapse_table_id (str): Default is an empty string ''. 2280 annotation_keys: (str) display_label/class_label(default), Determines labeling style for annotation keys. class_label will format the display 2281 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2282 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2283 Returns: 2284 manifest (pd.DataFrame): modified to add entityId as appropriate 2285 2286 """ 2287 2288 # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting 2289 if "filename" in [col.lower() for col in manifest.columns]: 2290 # get current list of files and store as dataframe 2291 dataset_files = self.getFilesInStorageDataset(datasetId) 2292 files_and_entityIds = self._get_file_entityIds( 2293 dataset_files=dataset_files, only_new_files=False 2294 ) 2295 file_df = pd.DataFrame(files_and_entityIds) 2296 2297 # Merge dataframes to add entityIds 2298 manifest = manifest.merge( 2299 file_df, how="left", on="Filename", suffixes=["_x", None] 2300 ).drop("entityId_x", axis=1) 2301 2302 # Fill `entityId` for each row if missing and annotate entity as appropriate 2303 requests = set() 2304 for idx, row in manifest.iterrows(): 2305 if not row["entityId"] and ( 2306 manifest_record_type == "file_and_entities" 2307 or manifest_record_type == "table_file_and_entities" 2308 ): 2309 manifest, entityId = self._create_entity_id( 2310 idx, row, manifest, datasetId 2311 ) 2312 elif not row["entityId"] and manifest_record_type == "table_and_file": 2313 # If not using entityIds, fill the column with the manifest table id so the field is not left empty 2314 row["entityId"] = manifest_synapse_table_id 2315 manifest.loc[idx, "entityId"] = manifest_synapse_table_id 2316 entityId = "" 2317 # If the row is the manifest table, do not add annotations 2318 elif row["entityId"] == manifest_synapse_table_id: 2319 entityId = "" 2320 else: 2321 # get the file id of the file to annotate, collected in above step. 2322 entityId = row["entityId"] 2323 2324 # Adding annotations to connected files.
2325 if entityId: 2326 # Format annotations for Synapse 2327 annos_task = asyncio.create_task( 2328 self.format_row_annotations( 2329 dmge, row, entityId, hideBlanks, annotation_keys 2330 ) 2331 ) 2332 requests.add(annos_task) 2333 await self._process_store_annos(requests) 2334 return manifest
Depending on upload type, add IDs to the entityId column. Add annotations to connected files and folders. Despite the name of this function, it also applies to folders.
Arguments:
- dmge: DataModelGraphExplorer Object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- datasetId (str): synapse ID of folder containing the dataset
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- manifest_synapse_table_id (str): Default is an empty string ''.
- annotation_keys: (str) display_label/class_label(default), Determines labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:
manifest (pd.DataFrame): modified to add entityId as appropriate
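A hedged sketch mirroring how the upload_* methods below drive this coroutine; the dataset synID is a placeholder, and `store`, `dmge`, and `manifest_df` are assumed from the earlier sketches:

```python
import asyncio

# Fill missing entityIds and annotate the connected files/folders.
manifest_df = asyncio.run(
    store.add_annotations_to_entities_files(
        dmge,
        manifest_df,
        manifest_record_type="file_and_entities",
        datasetId="syn11111111",  # placeholder dataset synID
        hideBlanks=True,
    )
)
```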
2336 @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table") 2337 def upload_manifest_as_table( 2338 self, 2339 dmge: DataModelGraphExplorer, 2340 manifest: pd.DataFrame, 2341 metadataManifestPath: str, 2342 datasetId: str, 2343 table_name: str, 2344 component_name: str, 2345 restrict: bool, 2346 manifest_record_type: str, 2347 hideBlanks: bool, 2348 table_manipulation: str, 2349 table_column_names: str, 2350 annotation_keys: str, 2351 file_annotations_upload: bool = True, 2352 ): 2353 """Upload manifest to Synapse as a table and csv. 2354 Args: 2355 dmge: DataModelGraphExplorer object 2356 manifest (pd.DataFrame): loaded df containing user supplied data. 2357 metadataManifestPath: path to csv containing a validated metadata manifest. 2358 datasetId (str): synapse ID of folder containing the dataset 2359 table_name (str): Generated to name the table being uploaded. 2360 component_name (str): Name of the component manifest that is currently being uploaded. 2361 restrict (bool): Flag for censored data. 2362 manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2363 hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false. 2364 table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2365 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2366 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2367 display label formatting. 2368 annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display 2369 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2370 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2371 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2372 Return: 2373 manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. 2374 """ 2375 # Upload manifest as a table, get the ID and updated manifest. 2376 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2377 dmge=dmge, 2378 manifest=manifest, 2379 datasetId=datasetId, 2380 table_name=table_name, 2381 restrict=restrict, 2382 table_manipulation=table_manipulation, 2383 table_column_names=table_column_names, 2384 ) 2385 2386 if file_annotations_upload: 2387 manifest = asyncio.run( 2388 self.add_annotations_to_entities_files( 2389 dmge, 2390 manifest, 2391 manifest_record_type, 2392 datasetId, 2393 hideBlanks, 2394 manifest_synapse_table_id, 2395 annotation_keys, 2396 ) 2397 ) 2398 # Load manifest to synapse as a CSV File 2399 manifest_synapse_file_id = self.upload_manifest_file( 2400 manifest=manifest, 2401 metadataManifestPath=metadataManifestPath, 2402 datasetId=datasetId, 2403 restrict_manifest=restrict, 2404 component_name=component_name, 2405 ) 2406 2407 # Set annotations for the file manifest.
2408 manifest_annotations = self.format_manifest_annotations( 2409 manifest=manifest, manifest_synapse_id=manifest_synapse_file_id 2410 ) 2411 annos = self.syn.set_annotations(annotations=manifest_annotations) 2412 manifest_entity = self.synapse_entity_tracker.get( 2413 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2414 ) 2415 manifest_entity.annotations = annos 2416 manifest_entity.etag = annos.etag 2417 2418 logger.info("Associated manifest file with dataset on Synapse.") 2419 2420 # Update manifest Synapse table with new entity id column. 2421 manifest_synapse_table_id, manifest, _ = self.uploadDB( 2422 dmge=dmge, 2423 manifest=manifest, 2424 datasetId=datasetId, 2425 table_name=table_name, 2426 restrict=restrict, 2427 table_manipulation="update", 2428 table_column_names=table_column_names, 2429 ) 2430 2431 # Set annotations for the table manifest 2432 manifest_annotations = self.format_manifest_annotations( 2433 manifest=manifest, manifest_synapse_id=manifest_synapse_table_id 2434 ) 2435 annotations_manifest_table = self.syn.set_annotations( 2436 annotations=manifest_annotations 2437 ) 2438 manifest_table_entity = self.synapse_entity_tracker.get( 2439 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2440 ) 2441 manifest_table_entity.annotations = annotations_manifest_table 2442 manifest_table_entity.etag = annotations_manifest_table.etag 2443 2444 return manifest_synapse_file_id
Upload manifest to Synapse as a table and csv.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- table_name (str): Generated to name the table being uploaded.
- component_name (str): Name of the component manifest that is currently being uploaded.
- restrict (bool): Flag for censored data.
- manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
- annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2446 @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv") 2447 def upload_manifest_as_csv( 2448 self, 2449 dmge, 2450 manifest, 2451 metadataManifestPath, 2452 datasetId, 2453 restrict, 2454 manifest_record_type, 2455 hideBlanks, 2456 component_name, 2457 annotation_keys: str, 2458 file_annotations_upload: bool = True, 2459 ): 2460 """Upload manifest to Synapse as a csv only. 2461 Args: 2462 dmge: DataModelGraphExplorer object 2463 manifest (pd.DataFrame): loaded df containing user supplied data. 2464 metadataManifestPath: path to csv containing a validated metadata manifest. 2465 datasetId (str): synapse ID of folder containing the dataset 2466 restrict (bool): Flag for censored data. 2467 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2468 hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false. 2469 annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display 2470 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2471 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2472 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2473 Return: 2474 manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 2475 """ 2476 if file_annotations_upload: 2477 manifest = asyncio.run( 2478 self.add_annotations_to_entities_files( 2479 dmge, 2480 manifest, 2481 manifest_record_type, 2482 datasetId, 2483 hideBlanks, 2484 annotation_keys=annotation_keys, 2485 ) 2486 ) 2487 2488 # Load manifest to synapse as a CSV File 2489 manifest_synapse_file_id = self.upload_manifest_file( 2490 manifest, 2491 metadataManifestPath, 2492 datasetId, 2493 restrict, 2494 component_name=component_name, 2495 ) 2496 2497 # Set annotations for the file manifest. 2498 manifest_annotations = self.format_manifest_annotations( 2499 manifest, manifest_synapse_file_id 2500 ) 2501 annos = self.syn.set_annotations(manifest_annotations) 2502 manifest_entity = self.synapse_entity_tracker.get( 2503 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2504 ) 2505 manifest_entity.annotations = annos 2506 manifest_entity.etag = annos.etag 2507 2508 logger.info("Associated manifest file with dataset on Synapse.") 2509 2510 return manifest_synapse_file_id
Upload manifest to Synapse as a csv only.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- restrict (bool): Flag for censored data.
- manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2512 @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo") 2513 def upload_manifest_combo( 2514 self, 2515 dmge, 2516 manifest, 2517 metadataManifestPath, 2518 datasetId, 2519 table_name, 2520 component_name, 2521 restrict, 2522 manifest_record_type, 2523 hideBlanks, 2524 table_manipulation, 2525 table_column_names: str, 2526 annotation_keys: str, 2527 file_annotations_upload: bool = True, 2528 ): 2529 """Upload manifest to Synapse as a table and CSV with entities. 2530 Args: 2531 dmge: DataModelGraphExplorer object 2532 manifest (pd.DataFrame): loaded df containing user supplied data. 2533 metadataManifestPath: path to csv containing a validated metadata manifest. 2534 datasetId (str): synapse ID of folder containing the dataset 2535 table_name (str): Generated to name the table being uploaded. 2536 component_name (str): Name of the component manifest that is currently being uploaded. 2537 restrict (bool): Flag for censored data. 2538 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2539 hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false. 2540 table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2541 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2542 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2543 display label formatting. 2544 annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display 2545 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2546 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2547 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2548 Return: 2549 manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 2550 """ 2551 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2552 dmge=dmge, 2553 manifest=manifest, 2554 datasetId=datasetId, 2555 table_name=table_name, 2556 restrict=restrict, 2557 table_manipulation=table_manipulation, 2558 table_column_names=table_column_names, 2559 ) 2560 2561 if file_annotations_upload: 2562 manifest = asyncio.run( 2563 self.add_annotations_to_entities_files( 2564 dmge, 2565 manifest, 2566 manifest_record_type, 2567 datasetId, 2568 hideBlanks, 2569 manifest_synapse_table_id, 2570 annotation_keys=annotation_keys, 2571 ) 2572 ) 2573 2574 # Load manifest to synapse as a CSV File 2575 manifest_synapse_file_id = self.upload_manifest_file( 2576 manifest, metadataManifestPath, datasetId, restrict, component_name 2577 ) 2578 2579 # Set annotations for the file manifest.
2580 manifest_annotations = self.format_manifest_annotations( 2581 manifest, manifest_synapse_file_id 2582 ) 2583 file_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2584 manifest_entity = self.synapse_entity_tracker.get( 2585 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2586 ) 2587 manifest_entity.annotations = file_manifest_annotations 2588 manifest_entity.etag = file_manifest_annotations.etag 2589 logger.info("Associated manifest file with dataset on Synapse.") 2590 2591 # Update manifest Synapse table with new entity id column. 2592 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2593 dmge=dmge, 2594 manifest=manifest, 2595 datasetId=datasetId, 2596 table_name=table_name, 2597 restrict=restrict, 2598 table_manipulation="update", 2599 table_column_names=table_column_names, 2600 ) 2601 2602 # Set annotations for the table manifest 2603 manifest_annotations = self.format_manifest_annotations( 2604 manifest, manifest_synapse_table_id 2605 ) 2606 table_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2607 manifest_entity = self.synapse_entity_tracker.get( 2608 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2609 ) 2610 manifest_entity.annotations = table_manifest_annotations 2611 manifest_entity.etag = table_manifest_annotations.etag 2612 return manifest_synapse_file_id
Upload manifest to Synapse as a table and CSV with entities.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- table_name (str): Generated to name the table being uploaded.
- component_name (str): Name of the component manifest that is currently being uploaded.
- restrict (bool): Flag for censored data.
- manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
- annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2614 @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") 2615 def associateMetadataWithFiles( 2616 self, 2617 dmge: DataModelGraphExplorer, 2618 metadataManifestPath: str, 2619 datasetId: str, 2620 manifest_record_type: str = "table_file_and_entities", 2621 hideBlanks: bool = False, 2622 restrict_manifest=False, 2623 table_manipulation: str = "replace", 2624 table_column_names: str = "class_label", 2625 annotation_keys: str = "class_label", 2626 file_annotations_upload: bool = True, 2627 ) -> str: 2628 """Associate metadata with files in a storage dataset already on Synapse. 2629 Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. 2630 2631 If this is a new manifest there could be no Synapse entities associated with the rows of this manifest 2632 this may be due to data type (e.g. clinical data) being tabular 2633 and not requiring files; to utilize uniform interfaces downstream 2634 (i.e. fileviews), a Synapse entity (a folder) is created for each row 2635 and an entity column is added to the manifest containing the resulting 2636 entity IDs; a table is also created at present as an additional interface 2637 for downstream query and interaction with the data. 2638 2639 Args: 2640 dmge: DataModelGraphExplorer Object 2641 metadataManifestPath: path to csv containing a validated metadata manifest. 2642 The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. 2643 Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item. 2644 In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to this file. 2645 datasetId: synapse ID of folder containing the dataset 2646 manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_and_entities and table in combination. 2647 hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false. 2648 restrict_manifest (bool): Default is false. Flag for censored data. 2649 table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2650 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2651 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2652 display label formatting. 2653 annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys.
class_label will format the display 2654 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2655 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2656 Returns: 2657 manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. 2658 """ 2659 # Read new manifest CSV: 2660 manifest = self._read_manifest(metadataManifestPath) 2661 manifest = self._add_id_columns_to_manifest(manifest, dmge) 2662 2663 table_name, component_name = self._generate_table_name(manifest) 2664 2665 # Upload manifest to synapse based on user input (manifest_record_type) 2666 if manifest_record_type == "file_only": 2667 manifest_synapse_file_id = self.upload_manifest_as_csv( 2668 dmge=dmge, 2669 manifest=manifest, 2670 metadataManifestPath=metadataManifestPath, 2671 datasetId=datasetId, 2672 restrict=restrict_manifest, 2673 hideBlanks=hideBlanks, 2674 manifest_record_type=manifest_record_type, 2675 component_name=component_name, 2676 annotation_keys=annotation_keys, 2677 file_annotations_upload=file_annotations_upload, 2678 ) 2679 elif manifest_record_type == "table_and_file": 2680 manifest_synapse_file_id = self.upload_manifest_as_table( 2681 dmge=dmge, 2682 manifest=manifest, 2683 metadataManifestPath=metadataManifestPath, 2684 datasetId=datasetId, 2685 table_name=table_name, 2686 component_name=component_name, 2687 restrict=restrict_manifest, 2688 hideBlanks=hideBlanks, 2689 manifest_record_type=manifest_record_type, 2690 table_manipulation=table_manipulation, 2691 table_column_names=table_column_names, 2692 annotation_keys=annotation_keys, 2693 file_annotations_upload=file_annotations_upload, 2694 ) 2695 elif manifest_record_type == "file_and_entities": 2696 manifest_synapse_file_id = self.upload_manifest_as_csv( 2697 dmge=dmge, 2698 manifest=manifest, 2699 metadataManifestPath=metadataManifestPath, 2700 datasetId=datasetId, 2701 restrict=restrict_manifest, 2702 hideBlanks=hideBlanks, 2703 manifest_record_type=manifest_record_type, 2704 component_name=component_name, 2705 annotation_keys=annotation_keys, 2706 file_annotations_upload=file_annotations_upload, 2707 ) 2708 elif manifest_record_type == "table_file_and_entities": 2709 manifest_synapse_file_id = self.upload_manifest_combo( 2710 dmge=dmge, 2711 manifest=manifest, 2712 metadataManifestPath=metadataManifestPath, 2713 datasetId=datasetId, 2714 table_name=table_name, 2715 component_name=component_name, 2716 restrict=restrict_manifest, 2717 hideBlanks=hideBlanks, 2718 manifest_record_type=manifest_record_type, 2719 table_manipulation=table_manipulation, 2720 table_column_names=table_column_names, 2721 annotation_keys=annotation_keys, 2722 file_annotations_upload=file_annotations_upload, 2723 ) 2724 else: 2725 raise ValueError("Please enter a valid manifest_record_type.") 2726 return manifest_synapse_file_id
Associate metadata with files in a storage dataset already on Synapse. Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
If this is a new manifest there could be no Synapse entities associated with the rows of this manifest this may be due to data type (e.g. clinical data) being tabular and not requiring files; to utilize uniform interfaces downstream (i.e. fileviews), a Synapse entity (a folder) is created for each row and an entity column is added to the manifest containing the resulting entity IDs; a table is also created at present as an additional interface for downstream query and interaction with the data.
Arguments:
- dmge: DataModelGraphExplorer Object
- metadataManifestPath: path to csv containing a validated metadata manifest.
- The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
- Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item.
- In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to this file.
- datasetId: synapse ID of folder containing the dataset
- manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_and_entities and table in combination.
- hideBlanks: Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
- restrict_manifest (bool): Default is False. Flag indicating the manifest contains censored data that requires access restrictions when uploaded.
- table_manipulation (str): Default is 'replace'. Specifies how manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name; class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting.
- annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:
manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
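A minimal usage sketch follows, assuming the enclosing method is SynapseStorage.associateMetadataWithFiles, that `store` is an authenticated SynapseStorage instance, and that `dmge` is a DataModelGraphExplorer built from the project's data model; the path and synID below are illustrative, and only keyword arguments documented above are used.

    # Usage sketch (illustrative values, not from the source):
    manifest_synapse_file_id = store.associateMetadataWithFiles(
        dmge=dmge,
        metadataManifestPath="output/synapse_storage_manifest.csv",  # hypothetical path
        datasetId="syn12345678",                                     # hypothetical dataset synID
        manifest_record_type="table_file_and_entities",              # the documented default
        hideBlanks=False,
        restrict_manifest=False,
        table_manipulation="replace",
    )
    print(manifest_synapse_file_id)  # SynID of the manifest csv uploaded to Synapse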
2728 def getTableAnnotations(self, table_id: str): 2729 """Generate dictionary of annotations for the given Synapse file. 2730 Synapse returns all custom annotations as lists since they 2731 can contain multiple values. In all cases, the values will 2732 be converted into strings and concatenated with ", ". 2733 2734 Args: 2735 fileId (str): Synapse ID for dataset file. 2736 2737 Returns: 2738 dict: Annotations as comma-separated strings. 2739 """ 2740 try: 2741 entity = self.synapse_entity_tracker.get( 2742 synapse_id=table_id, syn=self.syn, download_file=False 2743 ) 2744 is_table = entity.concreteType.endswith(".TableEntity") 2745 annotations_raw = entity.annotations 2746 except SynapseHTTPError: 2747 # If an error occurs with retrieving entity, skip it 2748 # This could be caused by a temporary file view that 2749 # was deleted since its ID was retrieved 2750 is_file, is_table = False, False 2751 2752 # Skip anything that isn't a file or folder 2753 if not (is_table): 2754 return None 2755 2756 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2757 2758 return annotations
Generate a dictionary of annotations for the given Synapse table. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values are converted into strings and concatenated with ", ".
Arguments:
- table_id (str): Synapse ID for the table.
Returns:
dict: Annotations as comma-separated strings.
2760 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2761 """Generate dictionary of annotations for the given Synapse file. 2762 Synapse returns all custom annotations as lists since they 2763 can contain multiple values. In all cases, the values will 2764 be converted into strings and concatenated with ", ". 2765 2766 Args: 2767 fileId (str): Synapse ID for dataset file. 2768 2769 Returns: 2770 dict: Annotations as comma-separated strings. 2771 """ 2772 2773 # Get entity metadata, including annotations 2774 try: 2775 entity = self.synapse_entity_tracker.get( 2776 synapse_id=fileId, syn=self.syn, download_file=False 2777 ) 2778 is_file = entity.concreteType.endswith(".FileEntity") 2779 is_folder = entity.concreteType.endswith(".Folder") 2780 annotations_raw = entity.annotations 2781 except SynapseHTTPError: 2782 # If an error occurs with retrieving entity, skip it 2783 # This could be caused by a temporary file view that 2784 # was deleted since its ID was retrieved 2785 is_file, is_folder = False, False 2786 2787 # Skip anything that isn't a file or folder 2788 if not (is_file or is_folder): 2789 return None 2790 2791 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2792 2793 return annotations
Generate dictionary of annotations for the given Synapse file. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".
Arguments:
- fileId (str): Synapse ID for dataset file.
Returns:
dict: Annotations as comma-separated strings.
2795 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2796 # Extract annotations from their lists and stringify. For example: 2797 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2798 annotations = dict() 2799 for key, vals in annotations_raw.items(): 2800 if isinstance(vals, list) and len(vals) == 1: 2801 annotations[key] = str(vals[0]) 2802 else: 2803 annotations[key] = ", ".join(str(v) for v in vals) 2804 2805 # Add the file entity ID and eTag, which weren't lists 2806 assert fileId == entity.id, ( 2807 "For some reason, the Synapse ID in the response doesn't match" 2808 "the Synapse ID sent in the request (via synapseclient)." 2809 ) 2810 annotations["entityId"] = fileId 2811 annotations["eTag"] = entity.etag 2812 2813 return annotations
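The stringification rule above is easiest to see with a plain dictionary; this self-contained sketch reproduces the loop's behavior on the example from the source comment.

    # Single-element lists collapse to a bare string; multi-element lists join with ", ".
    annotations_raw = {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}

    annotations = {}
    for key, vals in annotations_raw.items():
        if isinstance(vals, list) and len(vals) == 1:
            annotations[key] = str(vals[0])
        else:
            annotations[key] = ", ".join(str(v) for v in vals)

    assert annotations == {"YearofBirth": "1980", "author": "bruno, milen, sujay"}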
2815 def getDatasetAnnotations( 2816 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2817 ) -> pd.DataFrame: 2818 """Generate table for annotations across all files in given dataset. 2819 2820 Args: 2821 datasetId (str): Synapse ID for dataset folder. 2822 fill_na (bool): Whether to replace missing values with 2823 blank strings. 2824 force_batch (bool): Whether to force the function to use 2825 the batch mode, which uses a file view to retrieve 2826 annotations for a given dataset. Default to False 2827 unless there are more than 50 files in the dataset. 2828 2829 Returns: 2830 pd.DataFrame: Table of annotations. 2831 """ 2832 # Get all files in given dataset 2833 dataset_files = self.getFilesInStorageDataset(datasetId) 2834 2835 # if there are no dataset files, there are no annotations 2836 # return None 2837 if not dataset_files: 2838 return pd.DataFrame() 2839 2840 dataset_files_map = dict(dataset_files) 2841 dataset_file_ids, _ = list(zip(*dataset_files)) 2842 2843 # Get annotations for each file from Step 1 2844 # Batch mode 2845 try_batch = len(dataset_files) >= 50 or force_batch 2846 if try_batch: 2847 try: 2848 logger.info("Trying batch mode for retrieving Synapse annotations") 2849 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2850 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2851 logger.info( 2852 f"Unable to create a temporary file view bound to {datasetId}. " 2853 "Defaulting to slower iterative retrieval of annotations." 2854 ) 2855 # Default to the slower non-batch method 2856 logger.info("Batch mode failed (probably due to permission error)") 2857 try_batch = False 2858 2859 # Non-batch mode 2860 if not try_batch: 2861 logger.info("Using slower (non-batch) sequential mode") 2862 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2863 # Remove any annotations for non-file/folders (stored as None) 2864 records = filter(None, records) 2865 table = pd.DataFrame.from_records(records) 2866 2867 # Add filenames for the files that "survived" annotation retrieval 2868 filenames = [dataset_files_map[i] for i in table["entityId"]] 2869 2870 if "Filename" not in table.columns: 2871 table.insert(0, "Filename", filenames) 2872 2873 # Ensure that entityId and eTag are at the end 2874 entity_ids = table.pop("entityId") 2875 etags = table.pop("eTag") 2876 table.insert(len(table.columns), "entityId", entity_ids) 2877 table.insert(len(table.columns), "eTag", etags) 2878 2879 # Missing values are filled in with empty strings for Google Sheets 2880 if fill_na: 2881 table.fillna("", inplace=True) 2882 2883 # Force all values as strings 2884 return table.astype(str)
Generate table for annotations across all files in given dataset.
Arguments:
- datasetId (str): Synapse ID for dataset folder.
- fill_na (bool): Whether to replace missing values with blank strings.
- force_batch (bool): Whether to force the function to use batch mode, which uses a file view to retrieve annotations for a given dataset. Defaults to False; batch mode is automatically attempted when the dataset contains 50 or more files.
Returns:
pd.DataFrame: Table of annotations.
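A short usage sketch, assuming `store` is an authenticated SynapseStorage instance (the synID is illustrative); force_batch=True requests the file-view path even for small datasets.

    # Returns a pandas DataFrame with Filename first and entityId/eTag last.
    annotations_df = store.getDatasetAnnotations("syn12345678", fill_na=True)

    # Force the temporary-file-view (batch) path regardless of dataset size:
    batch_df = store.getDatasetAnnotations("syn12345678", force_batch=True)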
2898 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2899 @retry( 2900 stop=stop_after_attempt(5), 2901 wait=wait_chain( 2902 *[wait_fixed(10) for i in range(2)] 2903 + [wait_fixed(15) for i in range(2)] 2904 + [wait_fixed(20)] 2905 ), 2906 retry=retry_if_exception_type(LookupError), 2907 retry_error_callback=raise_final_error, 2908 ) 2909 def getDatasetProject(self, datasetId: str) -> str: 2910 """Get parent project for a given dataset ID. 2911 2912 Args: 2913 datasetId (str): Synapse entity ID (folder or project). 2914 2915 Raises: 2916 ValueError: Raised if Synapse ID cannot be retrieved 2917 by the user or if it doesn't appear in the file view. 2918 2919 Returns: 2920 str: The Synapse ID for the parent project. 2921 """ 2922 2923 # Subset main file view 2924 dataset_index = self.storageFileviewTable["id"] == datasetId 2925 dataset_row = self.storageFileviewTable[dataset_index] 2926 2927 # re-query if no datasets found 2928 if dataset_row.empty: 2929 sleep(5) 2930 self.query_fileview(force_requery=True) 2931 # Subset main file view 2932 dataset_index = self.storageFileviewTable["id"] == datasetId 2933 dataset_row = self.storageFileviewTable[dataset_index] 2934 2935 # Return `projectId` for given row if only one found 2936 if len(dataset_row) == 1: 2937 dataset_project = dataset_row["projectId"].values[0] 2938 return dataset_project 2939 2940 # Otherwise, check if already project itself 2941 try: 2942 syn_object = self.synapse_entity_tracker.get( 2943 synapse_id=datasetId, syn=self.syn, download_file=False 2944 ) 2945 if syn_object.properties["concreteType"].endswith("Project"): 2946 return datasetId 2947 except SynapseHTTPError: 2948 raise PermissionError( 2949 f"The given dataset ({datasetId}) isn't accessible with this " 2950 "user. This might be caused by a typo in the dataset Synapse ID." 2951 ) 2952 2953 # If not, then assume dataset not in file view 2954 raise LookupError( 2955 f"The given dataset ({datasetId}) doesn't appear in the " 2956 f"configured file view ({self.storageFileview}). This might " 2957 "mean that the file view's scope needs to be updated." 2958 )
Get parent project for a given dataset ID.
Arguments:
- datasetId (str): Synapse entity ID (folder or project).
Raises:
- ValueError: Raised if the Synapse ID cannot be retrieved by the user or if it doesn't appear in the file view.
Returns:
str: The Synapse ID for the parent project.
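For reference, the tenacity decorator above retries only on LookupError, making up to five attempts with waits of 10s, 10s, 15s, and 15s between them (the trailing wait_fixed(20) entry would apply only if further attempts were allowed). Below is a self-contained illustration of the same wait_chain pattern with shorter waits; the function name and return value are hypothetical.

    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_chain, wait_fixed

    calls = {"n": 0}

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_chain(wait_fixed(1), wait_fixed(2)),  # 1s after attempt 1, 2s after attempt 2
        retry=retry_if_exception_type(LookupError),
    )
    def eventually_in_fileview():
        calls["n"] += 1
        if calls["n"] < 3:
            raise LookupError("dataset not yet in the file view")
        return "syn_project_id"  # hypothetical return value

    print(eventually_in_fileview())  # succeeds on the third attempt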
2960 def getDatasetAnnotationsBatch( 2961 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2962 ) -> pd.DataFrame: 2963 """Generate table for annotations across all files in given dataset. 2964 This function uses a temporary file view to generate a table 2965 instead of iteratively querying for individual entity annotations. 2966 This function is expected to run much faster than 2967 `self.getDatasetAnnotationsBatch` on large datasets. 2968 2969 Args: 2970 datasetId (str): Synapse ID for dataset folder. 2971 dataset_file_ids (Sequence[str]): List of Synapse IDs 2972 for dataset files/folders used to subset the table. 2973 2974 Returns: 2975 pd.DataFrame: Table of annotations. 2976 """ 2977 # Create data frame from annotations file view 2978 with DatasetFileView(datasetId, self.syn) as fileview: 2979 table = fileview.query() 2980 2981 if dataset_file_ids: 2982 table = table.loc[table.index.intersection(dataset_file_ids)] 2983 2984 table = table.reset_index(drop=True) 2985 2986 return table
Generate table for annotations across all files in given dataset. This function uses a temporary file view to generate the table instead of iteratively querying for individual entity annotations, and is expected to run much faster than the iterative getDatasetAnnotations on large datasets.
Arguments:
- datasetId (str): Synapse ID for dataset folder.
- dataset_file_ids (Sequence[str]): List of Synapse IDs for dataset files/folders used to subset the table.
Returns:
pd.DataFrame: Table of annotations.
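The subsetting step relies on pandas index intersection; this standalone sketch mirrors it with a toy table indexed by entityId (the IDs are illustrative, and unmatched IDs are simply dropped).

    import pandas as pd

    table = pd.DataFrame(
        {"Filename": ["a.txt", "b.txt", "c.txt"]},
        index=["syn1", "syn2", "syn3"],
    )
    dataset_file_ids = ["syn1", "syn3", "syn999"]  # syn999 has no match and is ignored
    subset = table.loc[table.index.intersection(dataset_file_ids)].reset_index(drop=True)
    print(subset)  # rows for syn1 and syn3 only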
2999class TableOperations: 3000 """ 3001 Object to hold functions for various table operations specific to the Synapse Asset Store. 3002 3003 Currently implement operations are: 3004 createTable: upload a manifest as a new table when none exist 3005 replaceTable: replace a metadata in a table from one manifest with metadata from another manifest 3006 updateTable: add a column to a table that already exists on synapse 3007 3008 Operations currently in development are: 3009 upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest 3010 """ 3011 3012 def __init__( 3013 self, 3014 synStore: SynapseStorage, 3015 tableToLoad: pd.DataFrame = None, 3016 tableName: str = None, 3017 datasetId: str = None, 3018 existingTableId: str = None, 3019 restrict: bool = False, 3020 synapse_entity_tracker: SynapseEntityTracker = None, 3021 ): 3022 """ 3023 Class governing table operations (creation, replacement, upserts, updates) in schematic 3024 3025 tableToLoad: manifest formatted appropriately for the table 3026 tableName: name of the table to be uploaded 3027 datasetId: synID of the dataset for the manifest 3028 existingTableId: synId of the table currently exising on synapse (if there is one) 3029 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 3030 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 3031 3032 """ 3033 self.synStore = synStore 3034 self.tableToLoad = tableToLoad 3035 self.tableName = tableName 3036 self.datasetId = datasetId 3037 self.existingTableId = existingTableId 3038 self.restrict = restrict 3039 self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker() 3040 3041 @tracer.start_as_current_span("TableOperations::createTable") 3042 def createTable( 3043 self, 3044 columnTypeDict: dict = None, 3045 specifySchema: bool = True, 3046 ): 3047 """ 3048 Method to create a table from a metadata manifest and upload it to synapse 3049 3050 Args: 3051 columnTypeDict: dictionary schema for table columns: type, size, etc 3052 specifySchema: to specify a specific schema for the table format 3053 3054 Returns: 3055 table.schema.id: synID of the newly created table 3056 """ 3057 datasetEntity = self.synapse_entity_tracker.get( 3058 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3059 ) 3060 datasetName = datasetEntity.name 3061 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3062 3063 if not self.tableName: 3064 self.tableName = datasetName + "table" 3065 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3066 if specifySchema: 3067 if columnTypeDict == {}: 3068 logger.error("Did not provide a columnTypeDict.") 3069 # create list of columns: 3070 cols = [] 3071 for col in self.tableToLoad.columns: 3072 if col in table_schema_by_cname: 3073 col_type = table_schema_by_cname[col]["columnType"] 3074 max_size = ( 3075 table_schema_by_cname[col]["maximumSize"] 3076 if "maximumSize" in table_schema_by_cname[col].keys() 3077 else 100 3078 ) 3079 max_list_len = 250 3080 if max_size and max_list_len: 3081 cols.append( 3082 Column( 3083 name=col, 3084 columnType=col_type, 3085 maximumSize=max_size, 3086 maximumListLength=max_list_len, 3087 ) 3088 ) 3089 elif max_size: 3090 cols.append( 3091 Column(name=col, columnType=col_type, maximumSize=max_size) 3092 ) 3093 else: 3094 cols.append(Column(name=col, columnType=col_type)) 3095 else: 3096 # TODO add warning that the given col was 
not found and it's max size is set to 100 3097 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3098 schema = Schema( 3099 name=self.tableName, columns=cols, parent=datasetParentProject 3100 ) 3101 table = Table(schema, self.tableToLoad) 3102 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3103 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3104 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3105 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3106 return table.schema.id 3107 else: 3108 # For just uploading the tables to synapse using default 3109 # column types. 3110 table = build_table(self.tableName, datasetParentProject, self.tableToLoad) 3111 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3112 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3113 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3114 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3115 return table.schema.id 3116 3117 @tracer.start_as_current_span("TableOperations::replaceTable") 3118 def replaceTable( 3119 self, 3120 specifySchema: bool = True, 3121 columnTypeDict: dict = None, 3122 ): 3123 """ 3124 Method to replace an existing table on synapse with metadata from a new manifest 3125 3126 Args: 3127 specifySchema: to infer a schema for the table format 3128 columnTypeDict: dictionary schema for table columns: type, size, etc 3129 3130 Returns: 3131 existingTableId: synID of the already existing table that had its metadata replaced 3132 """ 3133 datasetEntity = self.synapse_entity_tracker.get( 3134 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3135 ) 3136 3137 datasetName = datasetEntity.name 3138 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3139 existing_table, existing_results = self.synStore.get_synapse_table( 3140 self.existingTableId 3141 ) 3142 # remove rows 3143 self.synStore.syn.delete(existing_results) 3144 # Data changes such as removing all rows causes the eTag to change. 
3145 self.synapse_entity_tracker.remove(synapse_id=self.existingTableId) 3146 # wait for row deletion to finish on synapse before getting empty table 3147 sleep(10) 3148 3149 # removes all current columns 3150 current_table = self.synapse_entity_tracker.get( 3151 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3152 ) 3153 3154 current_columns = self.synStore.syn.getTableColumns(current_table) 3155 3156 for col in current_columns: 3157 current_table.removeColumn(col) 3158 3159 if not self.tableName: 3160 self.tableName = datasetName + "table" 3161 3162 # Process columns according to manifest entries 3163 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3164 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3165 if specifySchema: 3166 if columnTypeDict == {}: 3167 logger.error("Did not provide a columnTypeDict.") 3168 # create list of columns: 3169 cols = [] 3170 3171 for col in self.tableToLoad.columns: 3172 if col in table_schema_by_cname: 3173 col_type = table_schema_by_cname[col]["columnType"] 3174 max_size = ( 3175 table_schema_by_cname[col]["maximumSize"] 3176 if "maximumSize" in table_schema_by_cname[col].keys() 3177 else 100 3178 ) 3179 max_list_len = 250 3180 if max_size and max_list_len: 3181 cols.append( 3182 Column( 3183 name=col, 3184 columnType=col_type, 3185 maximumSize=max_size, 3186 maximumListLength=max_list_len, 3187 ) 3188 ) 3189 elif max_size: 3190 cols.append( 3191 Column(name=col, columnType=col_type, maximumSize=max_size) 3192 ) 3193 else: 3194 cols.append(Column(name=col, columnType=col_type)) 3195 else: 3196 # TODO add warning that the given col was not found and it's max size is set to 100 3197 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3198 3199 # adds new columns to schema 3200 for col in cols: 3201 current_table.addColumn(col) 3202 3203 table_result = self.synStore.syn.store( 3204 current_table, isRestricted=self.restrict 3205 ) 3206 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3207 # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema) 3208 self.synapse_entity_tracker.remove(synapse_id=table_result.id) 3209 3210 # wait for synapse store to finish 3211 sleep(1) 3212 3213 # build schema and table from columns and store with necessary restrictions 3214 schema = Schema( 3215 name=self.tableName, columns=cols, parent=datasetParentProject 3216 ) 3217 schema.id = self.existingTableId 3218 table = Table(schema, self.tableToLoad, etag=existing_results.etag) 3219 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3220 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3221 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3222 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3223 else: 3224 logging.error("Must specify a schema for table replacements") 3225 3226 # remove system metadata from manifest 3227 existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) 3228 return self.existingTableId 3229 3230 @tracer.start_as_current_span("TableOperations::_get_auth_token") 3231 def _get_auth_token( 3232 self, 3233 ): 3234 authtoken = None 3235 3236 # Get access token from environment variable if available 3237 # Primarily useful for testing environments, with other possible usefulness for containers 3238 env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") 3239 if env_access_token: 3240 
authtoken = env_access_token 3241 return authtoken 3242 3243 # Get token from authorization header 3244 # Primarily useful for API endpoint functionality 3245 if "Authorization" in self.synStore.syn.default_headers: 3246 authtoken = self.synStore.syn.default_headers["Authorization"].split( 3247 "Bearer " 3248 )[-1] 3249 return authtoken 3250 3251 # retrive credentials from synapse object 3252 # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe 3253 synapse_object_creds = self.synStore.syn.credentials 3254 if hasattr(synapse_object_creds, "_token"): 3255 authtoken = synapse_object_creds.secret 3256 3257 # Try getting creds from .synapseConfig file if it exists 3258 # Primarily useful for local users. Seems to correlate with credentials stored in synaspe object when logged in 3259 if os.path.exists(CONFIG.synapse_configuration_path): 3260 config = get_config_file(CONFIG.synapse_configuration_path) 3261 3262 # check which credentials are provided in file 3263 if config.has_option("authentication", "authtoken"): 3264 authtoken = config.get("authentication", "authtoken") 3265 3266 # raise error if required credentials are not found 3267 if not authtoken: 3268 raise NameError( 3269 "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file" 3270 ) 3271 3272 return authtoken 3273 3274 @tracer.start_as_current_span("TableOperations::upsertTable") 3275 def upsertTable(self, dmge: DataModelGraphExplorer): 3276 """ 3277 Method to upsert rows from a new manifest into an existing table on synapse 3278 For upsert functionality to work, primary keys must follow the naming convention of <componenet>_id 3279 `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. 3280 Currently it is required to use -tcn "display label" with table upserts. 
3281 3282 3283 Args: 3284 dmge: DataModelGraphExplorer instance 3285 3286 Returns: 3287 existingTableId: synID of the already existing table that had its metadata replaced 3288 """ 3289 3290 authtoken = self._get_auth_token() 3291 3292 synapseDB = SynapseDatabase( 3293 auth_token=authtoken, 3294 project_id=self.synStore.getDatasetProject(self.datasetId), 3295 syn=self.synStore.syn, 3296 synapse_entity_tracker=self.synapse_entity_tracker, 3297 ) 3298 3299 try: 3300 # Try performing upsert 3301 synapseDB.upsert_table_rows( 3302 table_name=self.tableName, data=self.tableToLoad 3303 ) 3304 except SynapseHTTPError as ex: 3305 # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload 3306 if "Id is not a valid column name or id" in str(ex): 3307 self._update_table_uuid_column(dmge) 3308 synapseDB.upsert_table_rows( 3309 table_name=self.tableName, data=self.tableToLoad 3310 ) 3311 # Raise if other error 3312 else: 3313 raise ex 3314 3315 return self.existingTableId 3316 3317 @tracer.start_as_current_span("TableOperations::_update_table_uuid_column") 3318 def _update_table_uuid_column( 3319 self, 3320 dmge: DataModelGraphExplorer, 3321 ) -> None: 3322 """Removes the `Uuid` column when present, and relpaces with an `Id` column 3323 Used to enable backwards compatability for manifests using the old `Uuid` convention 3324 3325 Args: 3326 dmge: DataModelGraphExplorer instance 3327 3328 Returns: 3329 None 3330 """ 3331 3332 # Get the columns of the schema 3333 schema = self.synapse_entity_tracker.get( 3334 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3335 ) 3336 3337 cols = self.synStore.syn.getTableColumns(schema) 3338 3339 # Iterate through columns until `Uuid` column is found 3340 for col in cols: 3341 if col.name.lower() == "uuid": 3342 # See if schema has `Uuid` column specified 3343 try: 3344 uuid_col_in_schema = dmge.is_class_in_schema(col.name) 3345 except KeyError: 3346 uuid_col_in_schema = False 3347 3348 # If there is, then create a new `Id` column from scratch 3349 if uuid_col_in_schema: 3350 new_col = Column(columnType="STRING", maximumSize=64, name="Id") 3351 schema.addColumn(new_col) 3352 schema = self.synStore.syn.store(schema) 3353 # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema) 3354 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3355 self.synapse_entity_tracker.remove(synapse_id=schema.id) 3356 # If there is not, then use the old `Uuid` column as a basis for the new `Id` column 3357 else: 3358 # Build ColumnModel that will be used for new column 3359 id_column = Column( 3360 name="Id", 3361 columnType="STRING", 3362 maximumSize=64, 3363 defaultValue=None, 3364 maximumListLength=1, 3365 ) 3366 new_col_response = self.synStore.syn.store(id_column) 3367 3368 # Define columnChange body 3369 columnChangeDict = { 3370 "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest", 3371 "entityId": self.existingTableId, 3372 "changes": [ 3373 { 3374 "oldColumnId": col["id"], 3375 "newColumnId": new_col_response["id"], 3376 } 3377 ], 3378 } 3379 3380 self.synStore.syn._async_table_update( 3381 table=self.existingTableId, 3382 changes=[columnChangeDict], 3383 wait=False, 3384 ) 3385 break 3386 3387 return 3388 3389 @tracer.start_as_current_span("TableOperations::updateTable") 3390 def updateTable( 3391 self, 3392 update_col: str = "Id", 3393 ): 3394 """ 3395 Method to update an existing table with a new column 3396 3397 Args: 3398 
updateCol: column to index the old and new tables on 3399 3400 Returns: 3401 existingTableId: synID of the already existing table that had its metadata replaced 3402 """ 3403 existing_table, existing_results = self.synStore.get_synapse_table( 3404 self.existingTableId 3405 ) 3406 3407 self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) 3408 # store table with existing etag data and impose restrictions as appropriate 3409 table_result = self.synStore.syn.store( 3410 Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), 3411 isRestricted=self.restrict, 3412 ) 3413 # We cannot store the Table to the `synapse_entity_tracker` because there is 3414 # not `Schema` on the table object. The above `.store()` function call would 3415 # also update the ETag of the entity within Synapse. Remove it from the tracker 3416 # and re-retrieve it later on if needed again. 3417 self.synapse_entity_tracker.remove(synapse_id=table_result.tableId) 3418 3419 return self.existingTableId
Object to hold functions for various table operations specific to the Synapse Asset Store.
Currently implemented operations are:
- createTable: upload a manifest as a new table when none exists
- replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
- updateTable: add a column to a table that already exists on Synapse
Operations currently in development are:
- upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
3012 def __init__( 3013 self, 3014 synStore: SynapseStorage, 3015 tableToLoad: pd.DataFrame = None, 3016 tableName: str = None, 3017 datasetId: str = None, 3018 existingTableId: str = None, 3019 restrict: bool = False, 3020 synapse_entity_tracker: SynapseEntityTracker = None, 3021 ): 3022 """ 3023 Class governing table operations (creation, replacement, upserts, updates) in schematic 3024 3025 tableToLoad: manifest formatted appropriately for the table 3026 tableName: name of the table to be uploaded 3027 datasetId: synID of the dataset for the manifest 3028 existingTableId: synId of the table currently exising on synapse (if there is one) 3029 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 3030 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 3031 3032 """ 3033 self.synStore = synStore 3034 self.tableToLoad = tableToLoad 3035 self.tableName = tableName 3036 self.datasetId = datasetId 3037 self.existingTableId = existingTableId 3038 self.restrict = restrict 3039 self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
Class governing table operations (creation, replacement, upserts, updates) in schematic
Arguments:
- tableToLoad: manifest formatted appropriately for the table
- tableName: name of the table to be uploaded
- datasetId: synID of the dataset for the manifest
- existingTableId: synID of the table currently existing on Synapse (if there is one)
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
- synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
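An instantiation sketch, assuming `store` is an authenticated SynapseStorage and `manifest_df` is a manifest DataFrame already formatted for table upload; the table name and synID are illustrative.

    ops = TableOperations(
        synStore=store,
        tableToLoad=manifest_df,
        tableName="example_dataset_table",  # hypothetical table name
        datasetId="syn12345678",            # hypothetical dataset synID
        restrict=False,
    )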
3041 @tracer.start_as_current_span("TableOperations::createTable") 3042 def createTable( 3043 self, 3044 columnTypeDict: dict = None, 3045 specifySchema: bool = True, 3046 ): 3047 """ 3048 Method to create a table from a metadata manifest and upload it to synapse 3049 3050 Args: 3051 columnTypeDict: dictionary schema for table columns: type, size, etc 3052 specifySchema: to specify a specific schema for the table format 3053 3054 Returns: 3055 table.schema.id: synID of the newly created table 3056 """ 3057 datasetEntity = self.synapse_entity_tracker.get( 3058 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3059 ) 3060 datasetName = datasetEntity.name 3061 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3062 3063 if not self.tableName: 3064 self.tableName = datasetName + "table" 3065 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3066 if specifySchema: 3067 if columnTypeDict == {}: 3068 logger.error("Did not provide a columnTypeDict.") 3069 # create list of columns: 3070 cols = [] 3071 for col in self.tableToLoad.columns: 3072 if col in table_schema_by_cname: 3073 col_type = table_schema_by_cname[col]["columnType"] 3074 max_size = ( 3075 table_schema_by_cname[col]["maximumSize"] 3076 if "maximumSize" in table_schema_by_cname[col].keys() 3077 else 100 3078 ) 3079 max_list_len = 250 3080 if max_size and max_list_len: 3081 cols.append( 3082 Column( 3083 name=col, 3084 columnType=col_type, 3085 maximumSize=max_size, 3086 maximumListLength=max_list_len, 3087 ) 3088 ) 3089 elif max_size: 3090 cols.append( 3091 Column(name=col, columnType=col_type, maximumSize=max_size) 3092 ) 3093 else: 3094 cols.append(Column(name=col, columnType=col_type)) 3095 else: 3096 # TODO add warning that the given col was not found and it's max size is set to 100 3097 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3098 schema = Schema( 3099 name=self.tableName, columns=cols, parent=datasetParentProject 3100 ) 3101 table = Table(schema, self.tableToLoad) 3102 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3103 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3104 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3105 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3106 return table.schema.id 3107 else: 3108 # For just uploading the tables to synapse using default 3109 # column types. 3110 table = build_table(self.tableName, datasetParentProject, self.tableToLoad) 3111 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3112 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3113 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3114 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3115 return table.schema.id
Method to create a table from a metadata manifest and upload it to synapse
Arguments:
- columnTypeDict: dictionary schema for table columns: type, size, etc.
- specifySchema: whether to explicitly define the table schema (column types and sizes) rather than letting Synapse infer default column types
Returns:
table.schema.id: synID of the newly created table
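The column-construction rule in createTable can be summarized with synapseclient.Column directly. In this sketch (the one-entry schema dictionary is hypothetical), modeled columns take their modeled type, with maximumSize defaulting to 100 and maximumListLength set to 250, while unmodeled columns fall back to STRING with maximumSize 100.

    from synapseclient import Column

    table_schema_by_cname = {"YearofBirth": {"columnType": "INTEGER"}}  # hypothetical schema

    cols = []
    for col in ["YearofBirth", "UnmodeledColumn"]:
        if col in table_schema_by_cname:
            spec = table_schema_by_cname[col]
            cols.append(
                Column(
                    name=col,
                    columnType=spec["columnType"],
                    maximumSize=spec.get("maximumSize", 100),  # default when not modeled
                    maximumListLength=250,
                )
            )
        else:
            # Unknown column: fall back to a STRING column of size 100
            cols.append(Column(name=col, columnType="STRING", maximumSize=100))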
3117 @tracer.start_as_current_span("TableOperations::replaceTable") 3118 def replaceTable( 3119 self, 3120 specifySchema: bool = True, 3121 columnTypeDict: dict = None, 3122 ): 3123 """ 3124 Method to replace an existing table on synapse with metadata from a new manifest 3125 3126 Args: 3127 specifySchema: to infer a schema for the table format 3128 columnTypeDict: dictionary schema for table columns: type, size, etc 3129 3130 Returns: 3131 existingTableId: synID of the already existing table that had its metadata replaced 3132 """ 3133 datasetEntity = self.synapse_entity_tracker.get( 3134 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3135 ) 3136 3137 datasetName = datasetEntity.name 3138 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3139 existing_table, existing_results = self.synStore.get_synapse_table( 3140 self.existingTableId 3141 ) 3142 # remove rows 3143 self.synStore.syn.delete(existing_results) 3144 # Data changes such as removing all rows causes the eTag to change. 3145 self.synapse_entity_tracker.remove(synapse_id=self.existingTableId) 3146 # wait for row deletion to finish on synapse before getting empty table 3147 sleep(10) 3148 3149 # removes all current columns 3150 current_table = self.synapse_entity_tracker.get( 3151 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3152 ) 3153 3154 current_columns = self.synStore.syn.getTableColumns(current_table) 3155 3156 for col in current_columns: 3157 current_table.removeColumn(col) 3158 3159 if not self.tableName: 3160 self.tableName = datasetName + "table" 3161 3162 # Process columns according to manifest entries 3163 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3164 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3165 if specifySchema: 3166 if columnTypeDict == {}: 3167 logger.error("Did not provide a columnTypeDict.") 3168 # create list of columns: 3169 cols = [] 3170 3171 for col in self.tableToLoad.columns: 3172 if col in table_schema_by_cname: 3173 col_type = table_schema_by_cname[col]["columnType"] 3174 max_size = ( 3175 table_schema_by_cname[col]["maximumSize"] 3176 if "maximumSize" in table_schema_by_cname[col].keys() 3177 else 100 3178 ) 3179 max_list_len = 250 3180 if max_size and max_list_len: 3181 cols.append( 3182 Column( 3183 name=col, 3184 columnType=col_type, 3185 maximumSize=max_size, 3186 maximumListLength=max_list_len, 3187 ) 3188 ) 3189 elif max_size: 3190 cols.append( 3191 Column(name=col, columnType=col_type, maximumSize=max_size) 3192 ) 3193 else: 3194 cols.append(Column(name=col, columnType=col_type)) 3195 else: 3196 # TODO add warning that the given col was not found and it's max size is set to 100 3197 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3198 3199 # adds new columns to schema 3200 for col in cols: 3201 current_table.addColumn(col) 3202 3203 table_result = self.synStore.syn.store( 3204 current_table, isRestricted=self.restrict 3205 ) 3206 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3207 # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema) 3208 self.synapse_entity_tracker.remove(synapse_id=table_result.id) 3209 3210 # wait for synapse store to finish 3211 sleep(1) 3212 3213 # build schema and table from columns and store with necessary restrictions 3214 schema = Schema( 3215 name=self.tableName, columns=cols, parent=datasetParentProject 3216 ) 3217 
schema.id = self.existingTableId 3218 table = Table(schema, self.tableToLoad, etag=existing_results.etag) 3219 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3220 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3221 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3222 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3223 else: 3224 logging.error("Must specify a schema for table replacements") 3225 3226 # remove system metadata from manifest 3227 existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) 3228 return self.existingTableId
Method to replace an existing table on synapse with metadata from a new manifest
Arguments:
- specifySchema: whether to explicitly define the table schema; a schema must be specified for table replacements
- columnTypeDict: dictionary schema for table columns: type, size, etc
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
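A usage sketch for replacement, reusing the instantiation pattern shown earlier but pointing at an existing table; the synIDs and columnTypeDict are hypothetical, and a schema must be specified or the method logs an error.

    ops = TableOperations(
        synStore=store,
        tableToLoad=new_manifest_df,        # manifest whose metadata replaces the old rows
        tableName="example_dataset_table",
        datasetId="syn12345678",
        existingTableId="syn22223333",      # hypothetical synID of the table to replace
    )
    column_types = {"YearofBirth": {"columnType": "INTEGER"}}  # hypothetical
    same_table_id = ops.replaceTable(specifySchema=True, columnTypeDict=column_types)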
3274 @tracer.start_as_current_span("TableOperations::upsertTable") 3275 def upsertTable(self, dmge: DataModelGraphExplorer): 3276 """ 3277 Method to upsert rows from a new manifest into an existing table on synapse 3278 For upsert functionality to work, primary keys must follow the naming convention of <componenet>_id 3279 `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. 3280 Currently it is required to use -tcn "display label" with table upserts. 3281 3282 3283 Args: 3284 dmge: DataModelGraphExplorer instance 3285 3286 Returns: 3287 existingTableId: synID of the already existing table that had its metadata replaced 3288 """ 3289 3290 authtoken = self._get_auth_token() 3291 3292 synapseDB = SynapseDatabase( 3293 auth_token=authtoken, 3294 project_id=self.synStore.getDatasetProject(self.datasetId), 3295 syn=self.synStore.syn, 3296 synapse_entity_tracker=self.synapse_entity_tracker, 3297 ) 3298 3299 try: 3300 # Try performing upsert 3301 synapseDB.upsert_table_rows( 3302 table_name=self.tableName, data=self.tableToLoad 3303 ) 3304 except SynapseHTTPError as ex: 3305 # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload 3306 if "Id is not a valid column name or id" in str(ex): 3307 self._update_table_uuid_column(dmge) 3308 synapseDB.upsert_table_rows( 3309 table_name=self.tableName, data=self.tableToLoad 3310 ) 3311 # Raise if other error 3312 else: 3313 raise ex 3314 3315 return self.existingTableId
Method to upsert rows from a new manifest into an existing table on Synapse.
For upsert functionality to work, primary keys must follow the naming convention of `<component>_id`. `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
Currently it is required to use -tcn "display label" with table upserts.
Arguments:
- dmge: DataModelGraphExplorer instance
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
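A sketch of the primary-key convention required for upserts: for a hypothetical "Patient" component, the manifest must carry a Patient_id column that uniquely identifies each row (all values below are illustrative).

    import pandas as pd

    manifest_df = pd.DataFrame(
        {
            "Patient_id": ["p_001", "p_002"],  # <component>_id primary key
            "Sex": ["F", "M"],
            "YearofBirth": ["1980", "1992"],
        }
    )
    # ops = TableOperations(..., tableToLoad=manifest_df, tableName="patient_table")
    # ops.upsertTable(dmge)  # matches on Patient_id, updating or inserting rows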
3389 @tracer.start_as_current_span("TableOperations::updateTable") 3390 def updateTable( 3391 self, 3392 update_col: str = "Id", 3393 ): 3394 """ 3395 Method to update an existing table with a new column 3396 3397 Args: 3398 updateCol: column to index the old and new tables on 3399 3400 Returns: 3401 existingTableId: synID of the already existing table that had its metadata replaced 3402 """ 3403 existing_table, existing_results = self.synStore.get_synapse_table( 3404 self.existingTableId 3405 ) 3406 3407 self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) 3408 # store table with existing etag data and impose restrictions as appropriate 3409 table_result = self.synStore.syn.store( 3410 Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), 3411 isRestricted=self.restrict, 3412 ) 3413 # We cannot store the Table to the `synapse_entity_tracker` because there is 3414 # not `Schema` on the table object. The above `.store()` function call would 3415 # also update the ETag of the entity within Synapse. Remove it from the tracker 3416 # and re-retrieve it later on if needed again. 3417 self.synapse_entity_tracker.remove(synapse_id=table_result.tableId) 3418 3419 return self.existingTableId
Method to update an existing table with a new column
Arguments:
- update_col: column to index the old and new tables on; defaults to "Id"
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
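The merge performed by update_df (from schematic.utils.df_utils) aligns the existing table and the new manifest on update_col before the table is re-stored. As a rough standalone illustration of that kind of keyed update, here is pandas' own in-place update on a shared "Id" index; update_df's exact semantics live in its source.

    import pandas as pd

    existing = pd.DataFrame({"Id": ["r1", "r2"], "Sex": ["F", "M"]}).set_index("Id")
    incoming = pd.DataFrame({"Id": ["r2"], "Sex": ["F"]}).set_index("Id")

    existing.update(incoming)  # overwrite rows whose "Id" matches
    print(existing.reset_index())  # r2's Sex is now "F"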
3422class DatasetFileView: 3423 """Helper class to create temporary dataset file views. 3424 This class can be used in conjunction with a 'with' statement. 3425 This will ensure that the file view is deleted automatically. 3426 See SynapseStorage.getDatasetAnnotationsBatch for example usage. 3427 """ 3428 3429 def __init__( 3430 self, 3431 datasetId: str, 3432 synapse: Synapse, 3433 name: str = None, 3434 temporary: bool = True, 3435 parentId: str = None, 3436 ) -> None: 3437 """Create a file view scoped to a dataset folder. 3438 3439 Args: 3440 datasetId (str): Synapse ID for a dataset folder/project. 3441 synapse (Synapse): Used for Synapse requests. 3442 name (str): Name of the file view (temporary or not). 3443 temporary (bool): Whether to delete the file view on exit 3444 of either a 'with' statement or Python entirely. 3445 parentId (str, optional): Synapse ID specifying where to 3446 store the file view. Defaults to datasetId. 3447 """ 3448 3449 self.datasetId = datasetId 3450 self.synapse = synapse 3451 self.is_temporary = temporary 3452 3453 if name is None: 3454 self.name = f"schematic annotation file view for {self.datasetId}" 3455 3456 if self.is_temporary: 3457 uid = secrets.token_urlsafe(5) 3458 self.name = f"{self.name} - UID {uid}" 3459 3460 # TODO: Allow a DCC admin to configure a "universal parent" 3461 # Such as a Synapse project writeable by everyone. 3462 self.parentId = datasetId if parentId is None else parentId 3463 3464 # TODO: Create local sharing setting to hide from everyone else 3465 view_schema = EntityViewSchema( 3466 name=self.name, 3467 parent=self.parentId, 3468 scopes=self.datasetId, 3469 includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], 3470 addDefaultViewColumns=False, 3471 addAnnotationColumns=True, 3472 ) 3473 3474 # TODO: Handle failure due to insufficient permissions by 3475 # creating a temporary new project to store view 3476 self.view_schema = self.synapse.store(view_schema) 3477 3478 # These are filled in after calling `self.query()` 3479 self.results = None 3480 self.table = None 3481 3482 # Ensure deletion of the file view (last resort) 3483 if self.is_temporary: 3484 atexit.register(self.delete) 3485 3486 def __enter__(self): 3487 """Return file view when entering 'with' statement.""" 3488 return self 3489 3490 def __exit__(self, exc_type, exc_value, traceback): 3491 """Delete file view when exiting 'with' statement.""" 3492 if self.is_temporary: 3493 self.delete() 3494 3495 def delete(self): 3496 """Delete the file view on Synapse without deleting local table.""" 3497 if self.view_schema is not None: 3498 self.synapse.delete(self.view_schema) 3499 self.view_schema = None 3500 3501 def query(self, tidy=True, force=False): 3502 """Retrieve file view as a data frame (raw format sans index).""" 3503 if self.table is None or force: 3504 fileview_id = self.view_schema["id"] 3505 self.results = self.synapse.tableQuery(f"select * from {fileview_id}") 3506 self.table = self.results.asDataFrame( 3507 rowIdAndVersionInIndex=False, 3508 na_values=STR_NA_VALUES_FILTERED, 3509 keep_default_na=False, 3510 ) 3511 if tidy: 3512 self.tidy_table() 3513 return self.table 3514 3515 def tidy_table(self): 3516 """Convert raw file view data frame into more usable format.""" 3517 assert self.table is not None, "Must call `self.query()` first." 
3518 self._fix_default_columns() 3519 self._fix_list_columns() 3520 self._fix_int_columns() 3521 return self.table 3522 3523 def _fix_default_columns(self): 3524 """Rename default columns to match schematic expectations.""" 3525 3526 # Drop ROW_VERSION column if present 3527 if "ROW_VERSION" in self.table: 3528 del self.table["ROW_VERSION"] 3529 3530 # Rename id column to entityId and set as data frame index 3531 if "ROW_ID" in self.table: 3532 self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str) 3533 self.table = self.table.set_index("entityId", drop=False) 3534 del self.table["ROW_ID"] 3535 3536 # Rename ROW_ETAG column to eTag and place at end of data frame 3537 if "ROW_ETAG" in self.table: 3538 row_etags = self.table.pop("ROW_ETAG") 3539 3540 # eTag column may already present if users annotated data without submitting manifest 3541 # we're only concerned with the new values and not the existing ones 3542 if "eTag" in self.table: 3543 del self.table["eTag"] 3544 3545 self.table.insert(len(self.table.columns), "eTag", row_etags) 3546 3547 return self.table 3548 3549 def _get_columns_of_type(self, types): 3550 """Helper function to get list of columns of a given type(s).""" 3551 matching_columns = [] 3552 for header in self.results.headers: 3553 if header.columnType in types: 3554 matching_columns.append(header.name) 3555 return matching_columns 3556 3557 def _fix_list_columns(self): 3558 """Fix formatting of list-columns.""" 3559 list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"} 3560 list_columns = self._get_columns_of_type(list_types) 3561 for col in list_columns: 3562 self.table[col] = self.table[col].apply(lambda x: ", ".join(x)) 3563 return self.table 3564 3565 def _fix_int_columns(self): 3566 """Ensure that integer-columns are actually integers.""" 3567 int_columns = self._get_columns_of_type({"INTEGER"}) 3568 for col in int_columns: 3569 # Coercing to string because NaN is a floating point value 3570 # and cannot exist alongside integers in a column 3571 def to_int_fn(x): 3572 return "" if np.isnan(x) else str(int(x)) 3573 3574 self.table[col] = self.table[col].apply(to_int_fn) 3575 return self.table
Helper class to create temporary dataset file views. This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3429 def __init__( 3430 self, 3431 datasetId: str, 3432 synapse: Synapse, 3433 name: str = None, 3434 temporary: bool = True, 3435 parentId: str = None, 3436 ) -> None: 3437 """Create a file view scoped to a dataset folder. 3438 3439 Args: 3440 datasetId (str): Synapse ID for a dataset folder/project. 3441 synapse (Synapse): Used for Synapse requests. 3442 name (str): Name of the file view (temporary or not). 3443 temporary (bool): Whether to delete the file view on exit 3444 of either a 'with' statement or Python entirely. 3445 parentId (str, optional): Synapse ID specifying where to 3446 store the file view. Defaults to datasetId. 3447 """ 3448 3449 self.datasetId = datasetId 3450 self.synapse = synapse 3451 self.is_temporary = temporary 3452 3453 if name is None: 3454 self.name = f"schematic annotation file view for {self.datasetId}" 3455 3456 if self.is_temporary: 3457 uid = secrets.token_urlsafe(5) 3458 self.name = f"{self.name} - UID {uid}" 3459 3460 # TODO: Allow a DCC admin to configure a "universal parent" 3461 # Such as a Synapse project writeable by everyone. 3462 self.parentId = datasetId if parentId is None else parentId 3463 3464 # TODO: Create local sharing setting to hide from everyone else 3465 view_schema = EntityViewSchema( 3466 name=self.name, 3467 parent=self.parentId, 3468 scopes=self.datasetId, 3469 includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], 3470 addDefaultViewColumns=False, 3471 addAnnotationColumns=True, 3472 ) 3473 3474 # TODO: Handle failure due to insufficient permissions by 3475 # creating a temporary new project to store view 3476 self.view_schema = self.synapse.store(view_schema) 3477 3478 # These are filled in after calling `self.query()` 3479 self.results = None 3480 self.table = None 3481 3482 # Ensure deletion of the file view (last resort) 3483 if self.is_temporary: 3484 atexit.register(self.delete)
Create a file view scoped to a dataset folder.
Arguments:
- datasetId (str): Synapse ID for a dataset folder/project.
- synapse (Synapse): Used for Synapse requests.
- name (str): Name of the file view (temporary or not).
- temporary (bool): Whether to delete the file view on exit of either a 'with' statement or Python entirely.
- parentId (str, optional): Synapse ID specifying where to store the file view. Defaults to datasetId.
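Usage sketch, mirroring the call in SynapseStorage.getDatasetAnnotationsBatch; it assumes `syn` is a logged-in synapseclient.Synapse instance and the synID is illustrative. The 'with' block guarantees the temporary file view is deleted on exit.

    with DatasetFileView("syn12345678", syn) as fileview:
        table = fileview.query(tidy=True)
    print(table.head())  # annotations with entityId/eTag columns tidied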
3495 def delete(self): 3496 """Delete the file view on Synapse without deleting local table.""" 3497 if self.view_schema is not None: 3498 self.synapse.delete(self.view_schema) 3499 self.view_schema = None
Delete the file view on Synapse without deleting local table.
3501 def query(self, tidy=True, force=False): 3502 """Retrieve file view as a data frame (raw format sans index).""" 3503 if self.table is None or force: 3504 fileview_id = self.view_schema["id"] 3505 self.results = self.synapse.tableQuery(f"select * from {fileview_id}") 3506 self.table = self.results.asDataFrame( 3507 rowIdAndVersionInIndex=False, 3508 na_values=STR_NA_VALUES_FILTERED, 3509 keep_default_na=False, 3510 ) 3511 if tidy: 3512 self.tidy_table() 3513 return self.table
Retrieve file view as a data frame (raw format sans index).
3515 def tidy_table(self): 3516 """Convert raw file view data frame into more usable format.""" 3517 assert self.table is not None, "Must call `self.query()` first." 3518 self._fix_default_columns() 3519 self._fix_list_columns() 3520 self._fix_int_columns() 3521 return self.table
Convert raw file view data frame into more usable format.
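The two column fixes applied by tidy_table can be reproduced standalone: list columns are joined with ", " and integer columns are stringified with NaN mapped to the empty string.

    import numpy as np
    import pandas as pd

    table = pd.DataFrame({"author": [["bruno", "milen"]], "YearofBirth": [1980.0]})

    # _fix_list_columns: join list values into a single comma-separated string
    table["author"] = table["author"].apply(lambda x: ", ".join(x))

    # _fix_int_columns: coerce to string because NaN cannot coexist with ints
    table["YearofBirth"] = table["YearofBirth"].apply(
        lambda x: "" if np.isnan(x) else str(int(x))
    )

    print(table)  # author: "bruno, milen"; YearofBirth: "1980"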