schematic.store.synapse
Synapse storage class
1"""Synapse storage class""" 2 3import asyncio 4import atexit 5import logging 6import os 7import re 8import secrets 9import shutil 10import time 11import uuid # used to generate unique names for entities 12from copy import deepcopy 13from dataclasses import dataclass, field 14from time import sleep 15 16# allows specifying explicit variable types 17from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union 18 19import numpy as np 20import pandas as pd 21import synapseclient 22from opentelemetry import trace 23from synapseclient import Annotations as OldAnnotations 24from synapseclient import ( 25 Column, 26 EntityViewSchema, 27 EntityViewType, 28 File, 29 Folder, 30 Schema, 31 Synapse, 32 Table, 33 as_table_columns, 34) 35from synapseclient.annotations import _convert_to_annotations_list 36from synapseclient.api import get_config_file, get_entity_id_bundle2 37from synapseclient.core.constants.concrete_types import PROJECT_ENTITY 38from synapseclient.core.exceptions import ( 39 SynapseAuthenticationError, 40 SynapseHTTPError, 41 SynapseUnmetAccessRestrictions, 42) 43from synapseclient.models.annotations import Annotations 44from synapseclient.table import CsvFileTable, Schema, build_table 45from tenacity import ( 46 retry, 47 retry_if_exception_type, 48 stop_after_attempt, 49 wait_chain, 50 wait_fixed, 51) 52 53from schematic.configuration.configuration import CONFIG 54from schematic.exceptions import AccessCredentialsError 55from schematic.schemas.data_model_graph import DataModelGraphExplorer 56from schematic.store.base import BaseStorage 57from schematic.store.database.synapse_database import SynapseDatabase 58from schematic.store.synapse_tracker import SynapseEntityTracker 59from schematic.utils.df_utils import ( 60 STR_NA_VALUES_FILTERED, 61 col_in_dataframe, 62 load_df, 63 update_df, 64) 65 66# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment 67# Please do not remove these import statements 68from schematic.utils.general import ( 69 check_synapse_cache_size, 70 clear_synapse_cache, 71 create_temp_folder, 72 entity_type_mapping, 73 get_dir_size, 74) 75from schematic.utils.io_utils import cleanup_temporary_storage 76from schematic.utils.schema_utils import get_class_label_from_display_name 77from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list 78 79 80logger = logging.getLogger("Synapse storage") 81 82tracer = trace.get_tracer("Schematic") 83 84 85@dataclass 86class ManifestDownload(object): 87 """ 88 syn: an object of type synapseclient. 89 manifest_id: id of a manifest 90 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 91 """ 92 93 syn: synapseclient.Synapse 94 manifest_id: str 95 synapse_entity_tracker: SynapseEntityTracker = field( 96 default_factory=SynapseEntityTracker 97 ) 98 99 def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File: 100 """ 101 Try downloading a manifest to a specific folder (temporary or not). When the 102 `use_temporary_folder` is set to True, the manifest will be downloaded to a 103 temporary folder. This is useful for when the code is running as an API server 104 where multiple requests are being made at the same time. This will prevent 105 multiple requests from overwriting the same manifest file. When the 106 `use_temporary_folder` is set to False, the manifest will be downloaded to the 107 default manifest folder. 

        Args:
            use_temporary_folder: boolean argument indicating if a temporary folder
                should be used to store the manifest file. This is useful when running
                this code as an API server where multiple requests could be made at the
                same time. This is set to False when the code is being used from the
                CLI. Defaults to True.

        Returns:
            manifest_data: A Synapse file entity of the downloaded manifest
        """
        manifest_data = self.synapse_entity_tracker.get(
            synapse_id=self.manifest_id,
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        current_span = trace.get_current_span()
        if (
            manifest_data
            and (file_handle := manifest_data.get("_file_handle", None))
            and current_span.is_recording()
        ):
            current_span.set_attribute(
                "schematic.manifest_size", file_handle.get("contentSize", 0)
            )

        if manifest_data and manifest_data.path:
            return manifest_data

        if "SECRETS_MANAGER_SECRETS" in os.environ:
            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
            cleanup_temporary_storage(
                temporary_manifest_storage, time_delta_seconds=3600
            )
            # create a new directory to store manifest
            if not os.path.exists(temporary_manifest_storage):
                os.mkdir(temporary_manifest_storage)
            # create temporary folders for storing manifests
            download_location = create_temp_folder(
                path=temporary_manifest_storage,
                prefix=f"{self.manifest_id}-{time.time()}-",
            )
        else:
            if use_temporary_folder:
                download_location = create_temp_folder(
                    path=CONFIG.manifest_folder,
                    prefix=f"{self.manifest_id}-{time.time()}-",
                )
            else:
                download_location = CONFIG.manifest_folder

        manifest_data = self.synapse_entity_tracker.get(
            synapse_id=self.manifest_id,
            syn=self.syn,
            download_file=True,
            retrieve_if_not_present=True,
            download_location=download_location,
        )

        # This is doing a rename of the downloaded file. The reason this is important
        # is that we may be re-using a file that was previously downloaded, but
        # renamed. The file downloaded from the Synapse client is just a direct copy
        # of that renamed file. This code will set the name of the file back to the
        # original name that was used to download the file. Note: An MD5 checksum
        # of the file will still be performed, so if the file has changed, it will be
        # downloaded again.
        filename = manifest_data._file_handle.fileName
        if filename != os.path.basename(manifest_data.path):
            parent_folder = os.path.dirname(manifest_data.path)
            manifest_original_name_and_path = os.path.join(parent_folder, filename)

            self.syn.cache.remove(
                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
            )
            os.rename(manifest_data.path, manifest_original_name_and_path)
            manifest_data.path = manifest_original_name_and_path
            self.syn.cache.add(
                file_handle_id=manifest_data.dataFileHandleId,
                path=manifest_original_name_and_path,
                md5=manifest_data._file_handle.contentMd5,
            )

        return manifest_data

    def _entity_type_checking(self) -> None:
        """
        Check the entity type of the ID that needs to be downloaded.
        Returns:
            None; logs an error if the entity type is not a file.
        """
        # check the type of entity
        entity_type = entity_type_mapping(
            syn=self.syn,
            entity_id=self.manifest_id,
            synapse_entity_tracker=self.synapse_entity_tracker,
        )
        if entity_type != "file":
            logger.error(
                f"You are using entity type: {entity_type}. Please provide a file ID"
            )

    def download_manifest(
        self,
        newManifestName: str = "",
        manifest_df: pd.DataFrame = pd.DataFrame(),
        use_temporary_folder: bool = True,
    ) -> Union[str, File]:
        """
        Download a manifest based on a given manifest ID.
        Args:
            newManifestName(optional): new name of a manifest that gets downloaded.
            manifest_df(optional): a dataframe containing the name and ID of manifests in a given asset view
            use_temporary_folder(optional): whether a temporary folder should be used to store the manifest file. Defaults to True.
        Returns:
            manifest_data: Synapse entity file object
        """

        # enables retrying if user does not have access to uncensored manifest
        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
        manifest_data = ""

        # check entity type
        self._entity_type_checking()

        # download a manifest
        try:
            manifest_data = self._download_manifest_to_folder(
                use_temporary_folder=use_temporary_folder
            )
        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
            # if there's an error getting an uncensored manifest, try getting the censored manifest
            if not manifest_df.empty:
                censored_regex = re.compile(".*censored.*")
                censored = manifest_df["name"].str.contains(censored_regex)
                new_manifest_id = manifest_df[censored]["id"].iloc[0]
                self.manifest_id = new_manifest_id
                try:
                    manifest_data = self._download_manifest_to_folder(
                        use_temporary_folder=use_temporary_folder
                    )
                except (
                    SynapseUnmetAccessRestrictions,
                    SynapseAuthenticationError,
                ) as e:
                    raise PermissionError(
                        "You don't have access to censored and uncensored manifests in this dataset."
                    ) from e
            else:
                logger.error(
                    f"You don't have access to the requested resource: {self.manifest_id}"
                )

        if newManifestName and os.path.exists(manifest_data.get("path")):
            # Rename the file we just made to the new name
            new_manifest_filename = newManifestName + ".csv"

            # get the location of the existing manifest; the renamed manifest should live in the same folder
            parent_folder = os.path.dirname(manifest_data.get("path"))

            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)

            # Copy the file to the new location. A copy is used instead of a rename
            # to avoid any potential issues with the file being used in another
            # process. This avoids potential race conditions and concurrent-access issues.
            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)

            # Adding this to the cache will allow us to re-use the already downloaded
            # manifest file for up to 1 hour.
            self.syn.cache.add(
                file_handle_id=manifest_data.dataFileHandleId,
                path=new_manifest_path_name,
                md5=manifest_data._file_handle.contentMd5,
            )

            # Update file names/paths in manifest_data
            manifest_data["name"] = new_manifest_filename
            manifest_data["filename"] = new_manifest_filename
            manifest_data["path"] = new_manifest_path_name

        return manifest_data
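
# A minimal usage sketch for ManifestDownload (hypothetical Synapse IDs; assumes
# an authenticated synapseclient.Synapse instance):
#
#     syn = SynapseStorage.login(access_token="...")
#     md = ManifestDownload(syn=syn, manifest_id="syn12345678")
#     manifest_file = md.download_manifest(newManifestName="my_manifest")
#     print(manifest_file.path)  # local path of the downloaded (renamed) CSV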


class SynapseStorage(BaseStorage):
    """Implementation of Storage interface for datasets/files stored on Synapse.
    Provides utilities to list files in a specific project; update file annotations; create fileviews; etc.

    TODO: Need to define the interface and rename and/or refactor some of the methods below.
    """

    @tracer.start_as_current_span("SynapseStorage::__init__")
    def __init__(
        self,
        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
        access_token: Optional[str] = None,
        project_scope: Optional[list] = None,
        synapse_cache_path: Optional[str] = None,
        perform_query: Optional[bool] = True,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
    ) -> None:
        """Initializes a SynapseStorage object.

        Args:
            token (Optional[str], optional):
                Optional token parameter as found in browser cookie upon login to synapse.
                Defaults to None.
            access_token (Optional[str], optional):
                Optional access token (personal or OAuth).
                Defaults to None.
            project_scope (Optional[list], optional): Defaults to None.
            synapse_cache_path (Optional[str], optional):
                Location of synapse cache.
                Defaults to None.
        TODO:
            Consider the necessity of adding "columns" and "where_clauses" params to the constructor. Currently, with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
        """
        self.syn = self.login(synapse_cache_path, access_token)
        self.project_scope = project_scope
        self.storageFileview = CONFIG.synapse_master_fileview_id
        self.manifest = CONFIG.synapse_manifest_basename
        self.root_synapse_cache = self.syn.cache.cache_root_dir
        self.synapse_entity_tracker = SynapseEntityTracker()
        if perform_query:
            self.query_fileview(columns=columns, where_clauses=where_clauses)

    # TODO: When moving this over to a regular cron-job the following logic should be
    # moved out of `manifest_download`:
    # if "SECRETS_MANAGER_SECRETS" in os.environ:
    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
    def _purge_synapse_cache(
        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
    ) -> None:
        """
        Purge the Synapse cache if it exceeds a certain size. Defaults to 1 GB.
        Args:
            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
                before purging the cache. Default is 1 GB.
            minute_buffer (int): All files created this amount of time or older will be deleted.
        """
        # try clearing the cache
        # scan the directory and check the size of files
        if os.path.exists(self.root_synapse_cache):
            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
                1024**3
            )
            nbytes = get_dir_size(self.root_synapse_cache)
            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
            # if 1 GB has already been taken, purge cached files older than the buffer (15 min by default)
            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
                num_of_deleted_files = clear_synapse_cache(
                    self.syn.cache, minutes=minute_buffer
                )
                logger.info(
                    f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
                )
            else:
                # on AWS, the OS takes around 14-17% of our ephemeral storage (20 GiB);
                # instead of guessing how much space is left, log the size of .synapseCache here
                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")

    @tracer.start_as_current_span("SynapseStorage::query_fileview")
    def query_fileview(
        self,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
        force_requery: Optional[bool] = False,
    ) -> None:
        """
        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
        """
        self._purge_synapse_cache()

        # Initialize to assume that the new fileview query will be different from what may already be stored.
        # Initializes to True because generally one will not have already been performed.
        self.new_query_different = True

        # If a query has already been performed, store the query
        previous_query_built = hasattr(self, "fileview_query")
        if previous_query_built:
            previous_query = self.fileview_query

        # Build a query with the current given parameters and check to see if it is different from the previous
        self._build_query(columns=columns, where_clauses=where_clauses)
        if previous_query_built:
            self.new_query_different = self.fileview_query != previous_query

        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
        if self.new_query_different or force_requery:
            try:
                self.storageFileviewTable = self.syn.tableQuery(
                    query=self.fileview_query,
                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
            except SynapseHTTPError as exc:
                exception_text = str(exc)
                if "Unknown column path" in exception_text:
                    raise ValueError(
                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
                    ) from exc
                elif "Unknown column" in exception_text:
                    missing_column = exception_text.split("Unknown column ")[-1]
                    raise ValueError(
                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
                    ) from exc
                else:
                    raise AccessCredentialsError(self.storageFileview) from exc
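
    # A minimal sketch of re-scoping the fileview query (hypothetical Synapse IDs):
    #
    #     store = SynapseStorage(access_token="...")
    #     # narrow the cached fileview to a single dataset's files
    #     store.query_fileview(
    #         columns=["id", "path"],
    #         where_clauses=["parentId='syn99999999'", "type='file'"],
    #     )
    #     df = store.getStorageFileviewTable()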
    @staticmethod
    def build_clause_from_dataset_id(
        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
    ) -> str:
        """
        Method to build a where clause for a Synapse FileView query based on a dataset ID; can be used before an object is initialized.
        Args:
            dataset_id: Synapse ID of a dataset that should be used to limit the query
            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
        Returns:
            clause for the query, or an empty string if no dataset ID is provided
        """
        # Calling this method without specifying synIDs will complete but will not scope the view
        if (not dataset_id) and (not dataset_folder_list):
            return ""

        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
        if dataset_folder_list:
            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
            return f"parentId IN ({search_folders})"

        # `dataset_id` should be provided when all files are stored directly under the dataset folder
        return f"parentId='{dataset_id}'"

    def _build_query(
        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
    ):
        """
        Method to build a query for Synapse FileViews
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            self.storageFileview (str): Synapse FileView ID
            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
                Gets added to where_clauses; mostly included for backwards compatibility and as a more user-friendly way of subsetting the view in a simple way.
        """
        if columns is None:
            columns = []
        if where_clauses is None:
            where_clauses = []

        if self.project_scope:
            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
            where_clauses.append(project_scope_clause)

        if where_clauses:
            where_clauses = " AND ".join(where_clauses)
            where_clauses = f"WHERE {where_clauses} ;"
        else:
            where_clauses = ";"

        if columns:
            columns = ",".join(columns)
        else:
            columns = "*"

        self.fileview_query = (
            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
        )

        return
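
    # Example of the SQL these helpers produce (hypothetical IDs; note the
    # trailing `+ ['']` in `_build_query` pads the tuple so a single-project
    # scope still renders as valid SQL, i.e. ('syn1', '') rather than ('syn1',)):
    #
    #     SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
    #     # -> "parentId='syn123'"
    #     SynapseStorage.build_clause_from_dataset_id(dataset_folder_list=["syn1", "syn2"])
    #     # -> "parentId IN ('syn1', 'syn2')"
    #     # _build_query(columns=["id", "path"], where_clauses=["parentId='syn123'"])
    #     # then yields: SELECT id,path FROM <storageFileview> WHERE parentId='syn123' ;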
    @staticmethod
    @tracer.start_as_current_span("SynapseStorage::login")
    def login(
        synapse_cache_path: Optional[str] = None,
        access_token: Optional[str] = None,
    ) -> synapseclient.Synapse:
        """Login to Synapse

        Args:
            synapse_cache_path (Optional[str], optional): Location of the synapse cache. Defaults to None.
            access_token (Optional[str], optional): A synapse access token. Defaults to None.

        Raises:
            ValueError: If unable to log in with the access token.

        Returns:
            synapseclient.Synapse: A Synapse object that is logged in
        """
        if not access_token:
            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")

        # login using a token
        if access_token:
            try:
                syn = synapseclient.Synapse(
                    cache_root_dir=synapse_cache_path,
                    debug=False,
                    skip_checks=True,
                    cache_client=False,
                )
                syn.login(authToken=access_token, silent=True)
            except SynapseHTTPError as exc:
                raise ValueError(
                    "No access to resources. Please make sure that your token is correct"
                ) from exc
        else:
            # login using synapse credentials provided by the user in the .synapseConfig (default) file
            syn = synapseclient.Synapse(
                configPath=CONFIG.synapse_configuration_path,
                cache_root_dir=synapse_cache_path,
                debug=False,
                skip_checks=True,
                cache_client=False,
            )
            syn.login(silent=True)

        # set user id attribute
        current_span = trace.get_current_span()
        if current_span.is_recording():
            current_span.set_attribute("user.id", syn.credentials.owner_id)

        return syn
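
    # Minimal login sketch (assumes a valid personal access token, e.g. exported
    # as SYNAPSE_ACCESS_TOKEN in the environment):
    #
    #     syn = SynapseStorage.login()                      # token picked up from env
    #     syn = SynapseStorage.login(access_token="...")    # or passed explicitly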

    def missing_entity_handler(method):
        """Decorator to handle missing entities: log a warning and return None instead of raising."""

        def wrapper(*args, **kwargs):
            try:
                return method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper

    def async_missing_entity_handler(method):
        """Decorator to handle missing entities in async methods."""

        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            try:
                return await method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper

    def getStorageFileviewTable(self):
        """Returns the storageFileviewTable obtained during initialization."""
        return self.storageFileviewTable

    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.

        Args:
            currentUserId: synapse id for the user whose projects we want to get.

        Returns:
            A dictionary with a next page token and the results.
        """
        all_results = self.syn.restGET(
            "/projects/user/{principalId}".format(principalId=currentUserId)
        )

        # iterate over the next page token in the results while there is any
        while "nextPageToken" in all_results:
            results_token = self.syn.restGET(
                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
                    principalId=currentUserId,
                    nextPageToken=all_results["nextPageToken"],
                )
            )
            all_results["results"].extend(results_token["results"])

            if "nextPageToken" in results_token:
                all_results["nextPageToken"] = results_token["nextPageToken"]
            else:
                del all_results["nextPageToken"]

        return all_results

    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
    def getStorageProjects(
        self, project_scope: Optional[List] = None
    ) -> list[tuple[str, str]]:
        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

        Args:
            project_scope: list of project IDs used to limit the scope of the returned projects.

        Returns:
            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
        """

        # get the set of all storage Synapse projects accessible for this pipeline
        storageProjects = self.storageFileviewTable["projectId"].unique()

        # get the set of storage Synapse projects accessible for this user
        # get a list of projects from Synapse
        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
            current_user_id=self.syn.credentials.owner_id, syn=self.syn
        )
        project_id_to_name_dict = {}
        current_user_projects = []
        for project_header in current_user_project_headers:
            project_id_to_name_dict[project_header.get("id")] = project_header.get(
                "name"
            )
            current_user_projects.append(project_header.get("id"))

        # find the set of user projects that are also in this pipeline's storage projects set
        storageProjects = list(set(storageProjects) & set(current_user_projects))

        # Limit projects to scope if specified
        if project_scope:
            storageProjects = list(set(storageProjects) & set(project_scope))

        if not storageProjects:
            raise Warning(
                f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
            )

        # prepare a return list of project IDs and names
        projects = []
        for projectId in storageProjects:
            project_name_from_project_header = project_id_to_name_dict.get(projectId)
            projects.append((projectId, project_name_from_project_header))

        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])

        return sorted_projects_list
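
    # Sketch of the shape returned by getStorageProjects (hypothetical IDs/names):
    #
    #     store.getStorageProjects(project_scope=["syn11111111"])
    #     # -> [("syn11111111", "My Project")]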
656 """ 657 658 # select all folders and fetch their names from within the storage project; 659 # if folder content type is defined, only select folders that contain datasets 660 if "contentType" in self.storageFileviewTable.columns: 661 foldersTable = self.storageFileviewTable[ 662 (self.storageFileviewTable["contentType"] == "dataset") 663 & (self.storageFileviewTable["projectId"] == projectId) 664 ] 665 else: 666 foldersTable = self.storageFileviewTable[ 667 (self.storageFileviewTable["type"] == "folder") 668 & (self.storageFileviewTable["parentId"] == projectId) 669 ] 670 671 # get an array of tuples (folderId, folderName) 672 # some folders are part of datasets; others contain datasets 673 # each dataset parent is the project; folders part of a dataset have another folder as a parent 674 # to get folders if and only if they contain datasets for each folder 675 # check if folder's parent is the project; if so that folder contains a dataset, 676 # unless the folder list has already been filtered to dataset folders based on contentType attribute above 677 678 datasetList = [] 679 folderProperties = ["id", "name"] 680 for folder in list( 681 foldersTable[folderProperties].itertuples(index=False, name=None) 682 ): 683 datasetList.append(folder) 684 685 sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) 686 687 return sorted_dataset_list 688 689 @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") 690 def getFilesInStorageDataset( 691 self, datasetId: str, fileNames: List = None, fullpath: bool = True 692 ) -> List[Tuple[str, str]]: 693 """Gets all files (excluding manifest files) in a given dataset folder. 694 695 Args: 696 datasetId: synapse ID of a storage dataset. 697 fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. 698 metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 699 fullpath: if True return the full path as part of this filename; otherwise return just base filename 700 701 Returns: 702 A list of files; the list consists of tuples (fileId, fileName). 703 704 Raises: 705 ValueError: Dataset ID not found. 706 """ 707 file_list = [] 708 709 # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view 710 if self.storageFileviewTable.empty: 711 raise ValueError( 712 f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." 713 ) 714 715 child_path = self.storageFileviewTable.loc[ 716 self.storageFileviewTable["parentId"] == datasetId, "path" 717 ] 718 if child_path.empty: 719 raise LookupError( 720 f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 

    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
        """If both censored and uncensored manifests are present, return the uncensored manifest; if only one manifest is present, return the manifest ID of that manifest; if more than two manifests are present, return the manifest ID of the first one.
        Args:
            manifest: a dataframe containing the name and ID of manifests in a given asset view

        Returns:
            manifest_syn_id: ID of a given censored or uncensored manifest
        """
        censored_regex = re.compile(".*censored.*")
        censored = manifest["name"].str.contains(censored_regex)
        if any(censored):
            # Try to use the uncensored manifest first
            not_censored = ~censored
            if any(not_censored):
                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
            # if only censored manifests are available, just use the first censored manifest
            else:
                manifest_syn_id = manifest["id"].iloc[0]

        # otherwise, use the first (implied only) version that exists
        else:
            manifest_syn_id = manifest["id"].iloc[0]

        return manifest_syn_id
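
    # Selection behavior sketch (hypothetical rows):
    #
    #     df = pd.DataFrame(
    #         {"id": ["syn1", "syn2"],
    #          "name": ["synapse_storage_manifest_censored.csv",
    #                   "synapse_storage_manifest.csv"]}
    #     )
    #     store._get_manifest_id(df)  # -> "syn2" (the uncensored copy is preferred)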
810 "" (String): No pre-exisiting manifest in dataset. 811 """ 812 manifest_data = "" 813 814 # get a list of files containing the manifest for this dataset (if any) 815 all_files = self.storageFileviewTable 816 817 # construct regex based on manifest basename in the config 818 manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") 819 820 # search manifest based on given manifest basename regex above 821 # and return a dataframe containing name and id of manifests in a given asset view 822 manifest = all_files[ 823 (all_files["name"].str.contains(manifest_re, regex=True)) 824 & (all_files["parentId"] == datasetId) 825 ] 826 827 manifest = manifest[["id", "name"]] 828 829 # if there is no pre-exisiting manifest in the specified dataset 830 if manifest.empty: 831 logger.warning( 832 f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" 833 ) 834 return "" 835 836 # if there is an exisiting manifest 837 else: 838 manifest_syn_id = self._get_manifest_id(manifest) 839 if downloadFile: 840 md = ManifestDownload( 841 self.syn, 842 manifest_id=manifest_syn_id, 843 synapse_entity_tracker=self.synapse_entity_tracker, 844 ) 845 manifest_data = md.download_manifest( 846 newManifestName=newManifestName, 847 manifest_df=manifest, 848 use_temporary_folder=use_temporary_folder, 849 ) 850 # TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, 851 # then we should catch the error here without returning an empty string. 852 if not manifest_data: 853 logger.debug( 854 f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" 855 ) 856 return manifest_data 857 return manifest_syn_id 858 859 def getDataTypeFromManifest(self, manifestId: str): 860 """Fetch a manifest and return data types of all columns 861 Args: 862 manifestId: synapse ID of a manifest 863 """ 864 # get manifest file path 865 manifest_entity = self.synapse_entity_tracker.get( 866 synapse_id=manifestId, syn=self.syn, download_file=True 867 ) 868 manifest_filepath = manifest_entity.path 869 870 # load manifest dataframe 871 manifest = load_df( 872 manifest_filepath, 873 preserve_raw_input=False, 874 data_model=False, 875 ) 876 877 # convert the dataFrame to use best possible dtypes. 878 manifest_new = manifest.convert_dtypes() 879 880 # get data types of columns 881 result = manifest_new.dtypes.to_frame("dtypes").reset_index() 882 883 # return the result as a dictionary 884 result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() 885 886 return result_dict 887 888 def _get_files_metadata_from_dataset( 889 self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None 890 ) -> Optional[dict]: 891 """retrieve file ids under a particular datasetId 892 893 Args: 894 datasetId (str): a dataset id 895 only_new_files (bool): if only adding new files that are not already exist 896 manifest (pd.DataFrame): metadata manifest dataframe. Default to None. 

    def getDataTypeFromManifest(self, manifestId: str):
        """Fetch a manifest and return the data types of all columns.
        Args:
            manifestId: synapse ID of a manifest
        """
        # get manifest file path
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifestId, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path

        # load manifest dataframe
        manifest = load_df(
            manifest_filepath,
            preserve_raw_input=False,
            data_model=False,
        )

        # convert the dataframe to use the best possible dtypes
        manifest_new = manifest.convert_dtypes()

        # get data types of columns
        result = manifest_new.dtypes.to_frame("dtypes").reset_index()

        # return the result as a dictionary
        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()

        return result_dict

    def _get_files_metadata_from_dataset(
        self, datasetId: str, only_new_files: bool, manifest: Optional[pd.DataFrame] = None
    ) -> Optional[dict]:
        """Retrieve file IDs under a particular datasetId.

        Args:
            datasetId (str): a dataset ID
            only_new_files (bool): whether to only include files that do not already exist in the manifest
            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.

        Returns:
            a dictionary containing the filenames and entity IDs under the given datasetId, or None if there is nothing under the given dataset ID
        """
        dataset_files = self.getFilesInStorageDataset(datasetId)
        if dataset_files:
            dataset_file_names_id_dict = self._get_file_entityIds(
                dataset_files, only_new_files=only_new_files, manifest=manifest
            )
            return dataset_file_names_id_dict
        else:
            return None

    def add_entity_id_and_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> pd.DataFrame:
        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe; assumes this dataframe does not have an entityId column and that a Filename column is present but completely empty

        Returns:
            pd.DataFrame: the updated manifest dataframe
        """
        # get the file names and entity IDs of a given dataset
        dataset_files_dict = self._get_files_metadata_from_dataset(
            datasetId, only_new_files=False
        )

        if dataset_files_dict:
            # turn the manifest dataframe back into a dictionary for this operation
            manifest_dict = manifest.to_dict("list")

            # update the Filename column
            # add the entityId column to the end
            manifest_dict.update(dataset_files_dict)

            # if the Component column exists in the existing manifest, fill up that column
            if "Component" in manifest_dict.keys():
                manifest_dict["Component"] = manifest_dict["Component"] * max(
                    1, len(manifest_dict["Filename"])
                )

            # turn the dictionary back into a dataframe
            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
            manifest_df_updated = manifest_df_index.transpose()

            # fill NA with empty string
            manifest_df_updated = manifest_df_updated.fillna("")

            # drop the index
            manifest_df_updated = manifest_df_updated.reset_index(drop=True)

            return manifest_df_updated
        else:
            return manifest
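
    # Round-trip sketch for add_entity_id_and_filename (hypothetical data): an
    # empty-Filename manifest picks up the dataset's files and their entity IDs:
    #
    #     manifest = pd.DataFrame({"Filename": [""], "Component": ["BulkRNA-seq"]})
    #     store.add_entity_id_and_filename("syn33333333", manifest)
    #     #    Filename        Component    entityId
    #     # 0  sample_A.fastq  BulkRNA-seq  syn55555555
    #     # 1  sample_B.fastq  BulkRNA-seq  syn66666666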

    def fill_in_entity_id_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> Tuple[List, pd.DataFrame]:
        """Fill in the Filename and entityId columns. The entityId and Filename columns will be created if not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe.

        Returns:
            Tuple[List, pd.DataFrame]: a list of synIds that are under the given datasetId folder, and the updated manifest dataframe
        """
        # get the dataset file names and entity IDs as a list of tuples
        dataset_files = self.getFilesInStorageDataset(datasetId)

        # update the manifest with additional filenames, if any;
        # note that if there is an existing manifest and there are files in the dataset,
        # the columns Filename and entityId are assumed to be present in the manifest schema
        # TODO: use idiomatic pandas syntax
        if not dataset_files:
            manifest = manifest.fillna("")
            return dataset_files, manifest

        all_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=False, manifest=manifest
        )
        new_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=True, manifest=manifest
        )

        all_files = pd.DataFrame(all_files)
        new_files = pd.DataFrame(new_files)

        # update the manifest so that it contains the new dataset files
        manifest = (
            pd.concat([manifest, new_files], sort=False)
            .reset_index()
            .drop("index", axis=1)
        )

        # Reindex the manifest and new-files dataframes according to entityIds to align file paths and metadata
        manifest_reindex = manifest.set_index("entityId")
        all_files_reindex = all_files.set_index("entityId")
        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
            manifest_reindex
        )

        # Check if individual file paths in the manifest and from Synapse match
        file_paths_match = (
            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
        )

        # If any of the paths do not match, update the manifest with the file paths from Synapse
        if not file_paths_match.all():
            manifest_reindex.loc[
                ~file_paths_match, "Filename"
            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

            # reformat the manifest for further use
            manifest = manifest_reindex.reset_index()
            entityIdCol = manifest.pop("entityId")
            manifest.insert(len(manifest.columns), "entityId", entityIdCol)

        manifest = manifest.fillna("")
        return dataset_files, manifest

    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
    def updateDatasetManifestFiles(
        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
    ) -> Union[Tuple[str, pd.DataFrame], None]:
        """Fetch the names and entity IDs of all current files in the dataset in the store, if any; update the dataset's manifest with new files, if any.

        Args:
            dmge: DataModelGraphExplorer instance
            datasetId: synapse ID of a storage dataset.
            store: if set to True, store the updated manifest in the asset store; if set to False,
                return a pandas dataframe containing the updated manifest but do not store it in the asset store

        Returns:
            Synapse ID of the updated manifest and a pandas dataframe containing the updated manifest.
            If there is no existing manifest, or if the manifest does not have an entityId column, return None.
        """

        # get the existing manifest Synapse ID
        manifest_id = self.getDatasetManifest(datasetId)

        # if there is no manifest return None
        if not manifest_id:
            return None

        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_id, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path
        manifest = load_df(manifest_filepath)

        # If the manifest does not have an entityId column, trigger a new manifest to be generated
        if "entityId" not in manifest.columns:
            return None

        manifest_is_file_based = "Filename" in manifest.columns

        if manifest_is_file_based:
            # update the manifest with additional filenames, if any;
            # note that if there is an existing manifest and there are files in the dataset,
            # the columns Filename and entityId are assumed to be present in the manifest schema
            # TODO: use idiomatic pandas syntax
            dataset_files, manifest = self.fill_in_entity_id_filename(
                datasetId, manifest
            )
            if dataset_files:
                # update the manifest file so that it contains the relevant entity IDs
                if store:
                    manifest.to_csv(manifest_filepath, index=False)

                    # store the manifest and update the associated metadata with the manifest on Synapse
                    manifest_id = self.associateMetadataWithFiles(
                        dmge, manifest_filepath, datasetId
                    )

        return manifest_id, manifest

    def _get_file_entityIds(
        self,
        dataset_files: List,
        only_new_files: bool = False,
        manifest: Optional[pd.DataFrame] = None,
    ):
        """
        Get a dictionary of files in a dataset: either only files that are not in the current manifest, or all files.

        Args:
            manifest: metadata manifest
            dataset_files: list of all files in a dataset
            only_new_files: boolean to control whether only new files are returned or all files in the dataset
        Returns:
            files: dictionary of file names and entityIds, with scope as specified by `only_new_files`
        """
        files = {"Filename": [], "entityId": []}

        if only_new_files:
            if manifest is None:
                raise UnboundLocalError(
                    "No manifest was passed in; a manifest is required when `only_new_files` is True."
                )

            if "entityId" not in manifest.columns:
                raise ValueError(
                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
                    "Please generate an empty manifest without annotations, manually add annotations to the "
                    "appropriate files in the manifest, and then try again."
                )

            # find new files (that are not in the current manifest), if any
            for file_id, file_name in dataset_files:
                if file_id not in manifest["entityId"].values:
                    files["Filename"].append(file_name)
                    files["entityId"].append(file_id)
        else:
            # get all files
            for file_id, file_name in dataset_files:
                files["Filename"].append(file_name)
                files["entityId"].append(file_id)

        return files
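
    # Behavior sketch for _get_file_entityIds (hypothetical values):
    #
    #     dataset_files = [("syn55555555", "sample_A.fastq"), ("syn66666666", "sample_B.fastq")]
    #     manifest = pd.DataFrame({"entityId": ["syn55555555"]})
    #     store._get_file_entityIds(dataset_files, only_new_files=True, manifest=manifest)
    #     # -> {"Filename": ["sample_B.fastq"], "entityId": ["syn66666666"]}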

    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
    def getProjectManifests(
        self, projectId: str
    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
        """Gets all metadata manifest files across all datasets in a specified project.

        Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
            as a list of tuples, one for each manifest:
            [
                (
                    (datasetId, dataName),
                    (manifestId, manifestName),
                    (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
                ),
                ...
            ]

        TODO: Return a manifest URI instead of a Synapse ID for interoperability with other implementations of a store interface
        """
        component = None
        entity = None
        manifests = []

        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            # Get the synID of the manifest for a dataset
            manifestId = self.getDatasetManifest(datasetId)

            # If a manifest exists, get the annotations for it, else return the base 'manifest' tuple
            if manifestId:
                annotations = self.getFileAnnotations(manifestId)

                # If the manifest has annotations specifying the component, use that
                if annotations and "Component" in annotations:
                    component = annotations["Component"]
                    entity = self.synapse_entity_tracker.get(
                        synapse_id=manifestId, syn=self.syn, download_file=False
                    )
                    manifest_name = entity["properties"]["name"]

                # otherwise, download the manifest and parse it for the information
                elif not annotations or "Component" not in annotations:
                    logging.debug(
                        f"No component annotations have been found for manifest {manifestId}. "
                        "The manifest will be downloaded and parsed instead. "
                        "For increased speed, add component annotations to the manifest."
                    )

                    manifest_info = self.getDatasetManifest(
                        datasetId, downloadFile=True
                    )
                    manifest_name = manifest_info["properties"].get("name", "")

                    if not manifest_name:
                        logger.error(f"Failed to download manifests from {datasetId}")

                    manifest_path = manifest_info["path"]

                    manifest_df = load_df(manifest_path)

                    # Get the component from the Component column if it exists
                    if (
                        "Component" in manifest_df
                        and not manifest_df["Component"].empty
                    ):
                        component = list(set(manifest_df["Component"]))

                        # Added to address issues raised during DCA testing
                        if "" in component:
                            component.remove("")

                        if len(component) == 1:
                            component = component[0]
                        elif len(component) > 1:
                            logging.warning(
                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
                                "Behavior of manifests with multiple components is undefined."
                            )
            else:
                manifest_name = ""
                component = None
            if component:
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    (component, component),
                )
            elif manifestId:
                logging.debug(
                    f"Manifest {manifestId} does not have an associated Component"
                )
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    ("", ""),
                )
            else:
                manifest = (
                    (datasetId, datasetName),
                    ("", ""),
                    ("", ""),
                )

            if manifest:
                manifests.append(manifest)

        return manifests
1203 "Behavior of manifests with multiple components is undefined" 1204 ) 1205 else: 1206 manifest_name = "" 1207 component = None 1208 if component: 1209 manifest = ( 1210 (datasetId, datasetName), 1211 (manifestId, manifest_name), 1212 (component, component), 1213 ) 1214 elif manifestId: 1215 logging.debug( 1216 f"Manifest {manifestId} does not have an associated Component" 1217 ) 1218 manifest = ( 1219 (datasetId, datasetName), 1220 (manifestId, manifest_name), 1221 ("", ""), 1222 ) 1223 else: 1224 manifest = ( 1225 (datasetId, datasetName), 1226 ("", ""), 1227 ("", ""), 1228 ) 1229 1230 if manifest: 1231 manifests.append(manifest) 1232 1233 return manifests 1234 1235 def upload_project_manifests_to_synapse( 1236 self, dmge: DataModelGraphExplorer, projectId: str 1237 ) -> List[str]: 1238 """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. 1239 1240 Returns: String of all the manifest_table_ids of all the manifests that have been loaded. 1241 """ 1242 1243 manifests = [] 1244 manifest_loaded = [] 1245 datasets = self.getStorageDatasetsInProject(projectId) 1246 1247 for datasetId, datasetName in datasets: 1248 # encode information about the manifest in a simple list (so that R clients can unpack it) 1249 # eventually can serialize differently 1250 1251 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1252 1253 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1254 if manifest_info: 1255 manifest_id = manifest_info["properties"]["id"] 1256 manifest_name = manifest_info["properties"]["name"] 1257 manifest_path = manifest_info["path"] 1258 manifest_df = load_df(manifest_path) 1259 manifest_table_id = uploadDB( 1260 dmge=dmge, 1261 manifest=manifest, 1262 datasetId=datasetId, 1263 table_name=datasetName, 1264 ) 1265 manifest_loaded.append(datasetName) 1266 return manifest_loaded 1267 1268 def upload_annotated_project_manifests_to_synapse( 1269 self, projectId: str, path_to_json_ld: str, dry_run: bool = False 1270 ) -> List[str]: 1271 """ 1272 Purpose: 1273 For all manifests in a project, upload them as a table and add annotations manifest csv. 1274 Assumes the manifest is already present as a CSV in a dataset in the project. 
1275 1276 """ 1277 # Instantiate DataModelParser 1278 data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld) 1279 # Parse Model 1280 parsed_data_model = data_model_parser.parse_model() 1281 1282 # Instantiate DataModelGraph 1283 data_model_grapher = DataModelGraph(parsed_data_model) 1284 1285 # Generate graph 1286 graph_data_model = data_model_grapher.generate_data_model_graph() 1287 1288 # Instantiate DataModelGraphExplorer 1289 dmge = DataModelGraphExplorer(graph_data_model) 1290 1291 manifests = [] 1292 manifest_loaded = [] 1293 datasets = self.getStorageDatasetsInProject(projectId) 1294 for datasetId, datasetName in datasets: 1295 # encode information about the manifest in a simple list (so that R clients can unpack it) 1296 # eventually can serialize differently 1297 1298 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1299 manifests.append(manifest) 1300 1301 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1302 1303 if manifest_info: 1304 manifest_id = manifest_info["properties"]["id"] 1305 manifest_name = manifest_info["properties"]["name"] 1306 manifest_path = manifest_info["path"] 1307 manifest = ( 1308 (datasetId, datasetName), 1309 (manifest_id, manifest_name), 1310 ("", ""), 1311 ) 1312 if not dry_run: 1313 self.associateMetadataWithFiles( 1314 dmge, manifest_path, datasetId, manifest_record_type="table" 1315 ) 1316 manifest_loaded.append(manifest) 1317 1318 return manifests, manifest_loaded 1319 1320 def move_entities_to_new_project( 1321 self, 1322 projectId: str, 1323 newProjectId: str, 1324 returnEntities: bool = False, 1325 dry_run: bool = False, 1326 ): 1327 """ 1328 For each manifest csv in a project, look for all the entitiy ids that are associated. 1329 Look up the entitiy in the files, move the entity to new project. 1330 """ 1331 1332 manifests = [] 1333 manifest_loaded = [] 1334 datasets = self.getStorageDatasetsInProject(projectId) 1335 if datasets: 1336 for datasetId, datasetName in datasets: 1337 # encode information about the manifest in a simple list (so that R clients can unpack it) 1338 # eventually can serialize differently 1339 1340 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1341 manifests.append(manifest) 1342 1343 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1344 if manifest_info: 1345 manifest_id = manifest_info["properties"]["id"] 1346 manifest_name = manifest_info["properties"]["name"] 1347 manifest_path = manifest_info["path"] 1348 manifest_df = load_df(manifest_path) 1349 1350 manifest = ( 1351 (datasetId, datasetName), 1352 (manifest_id, manifest_name), 1353 ("", ""), 1354 ) 1355 manifest_loaded.append(manifest) 1356 1357 annotation_entities = self.storageFileviewTable[ 1358 (self.storageFileviewTable["id"].isin(manifest_df["entityId"])) 1359 & (self.storageFileviewTable["type"] == "folder") 1360 ]["id"] 1361 1362 if returnEntities: 1363 for entityId in annotation_entities: 1364 if not dry_run: 1365 moved_entity = self.syn.move(entityId, datasetId) 1366 self.synapse_entity_tracker.add( 1367 synapse_id=moved_entity.id, entity=moved_entity 1368 ) 1369 else: 1370 logging.info( 1371 f"{entityId} will be moved to folder {datasetId}." 

    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
        """Download a Synapse table as a pandas dataframe; also return the table schema and etags as results.

        Args:
            synapse_id: synapse ID of the table to query
        """

        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
        df = results.asDataFrame(
            rowIdAndVersionInIndex=False,
            na_values=STR_NA_VALUES_FILTERED,
            keep_default_na=False,
        )

        return df, results

    @missing_entity_handler
    @tracer.start_as_current_span("SynapseStorage::uploadDB")
    def uploadDB(
        self,
        dmge: DataModelGraphExplorer,
        manifest: pd.DataFrame,
        datasetId: str,
        table_name: str,
        restrict: bool = False,
        table_manipulation: str = "replace",
        table_column_names: str = "class_label",
    ):
        """
        Method to upload a database to an asset store. In Synapse, this will upload a metadata table.

        Args:
            dmge: DataModelGraphExplorer object
            manifest: pd.DataFrame manifest to upload
            datasetId: synID of the dataset for the manifest
            table_name: name of the table to be uploaded
            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
            table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing table (replace) or be added to it (upsert)
            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display-label formatting.
        Returns:
            manifest_table_id: synID of the uploaded table
            manifest: the original manifest
            table_manifest: manifest formatted appropriately for the table
        """

        col_schema, table_manifest = self.formatDB(
            dmge=dmge, manifest=manifest, table_column_names=table_column_names
        )

        manifest_table_id = self.buildDB(
            datasetId,
            table_name,
            col_schema,
            table_manifest,
            table_manipulation,
            dmge,
            restrict,
        )

        return manifest_table_id, manifest, table_manifest
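
    # Upsert sketch (hypothetical IDs and names; upsert relies on a
    # `<Component>_id` primary-key annotation, which buildDB sets on the table):
    #
    #     table_id, manifest, table_manifest = store.uploadDB(
    #         dmge=dmge,
    #         manifest=manifest_df,
    #         datasetId="syn33333333",
    #         table_name="Bulk RNA-seq synapse storage manifest table",
    #         table_manipulation="upsert",
    #     )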

    @tracer.start_as_current_span("SynapseStorage::formatDB")
    def formatDB(self, dmge, manifest, table_column_names):
        """
        Method to format a manifest appropriately for upload as a table.

        Args:
            dmge: DataModelGraphExplorer object
            manifest: pd.DataFrame manifest to upload
            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display-label formatting.
        Returns:
            col_schema: schema for the table columns: type, size, etc.
            table_manifest: formatted manifest
        """
        # Rename the manifest columns to display names to match the fileview

        blacklist_chars = ["(", ")", ".", " ", "-"]
        manifest_columns = manifest.columns.tolist()

        table_manifest = deepcopy(manifest)

        if table_column_names == "display_name":
            cols = table_manifest.columns

        elif table_column_names == "display_label":
            cols = [
                str(col).translate({ord(x): "" for x in blacklist_chars})
                for col in manifest_columns
            ]

        elif table_column_names == "class_label":
            cols = [
                get_class_label_from_display_name(str(col)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )
                for col in manifest_columns
            ]
        else:
            raise ValueError(
                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
            )

        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))

        # Reset the column names in the table manifest
        table_manifest.columns = cols

        # move the entityId column to the end of the dataframe
        entity_col = table_manifest.pop("entityId")
        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)

        # Get the column schema
        col_schema = as_table_columns(table_manifest)

        # Set the Id column length to 64 (for some reason it is not being auto-set)
        for i, col in enumerate(col_schema):
            if col["name"].lower() == "id":
                col_schema[i]["maximumSize"] = 64

        return col_schema, table_manifest
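
    # Column-name formatting sketch: with table_column_names="display_label",
    # str.translate simply strips the blacklisted characters ("(", ")", ".",
    # " ", "-"), so a display name like "Scan Date." becomes "ScanDate";
    # "class_label" additionally applies upper-camelcase formatting via
    # get_class_label_from_display_name before stripping.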
    @tracer.start_as_current_span("SynapseStorage::buildDB")
    def buildDB(
        self,
        datasetId: str,
        table_name: str,
        col_schema: List,
        table_manifest: pd.DataFrame,
        table_manipulation: str,
        dmge: DataModelGraphExplorer,
        restrict: bool = False,
    ):
        """
        Method to construct the table appropriately: create a new table, replace an existing one, or upsert new rows into an existing table.
        Calls the TableOperations class to execute.

        Args:
            datasetId: synID of the dataset for the manifest
            table_name: name of the table to be uploaded
            col_schema: schema for table columns (type, size, etc.) from `formatDB`
            table_manifest: formatted manifest that can be uploaded as a table
            table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions

        Returns:
            manifest_table_id: synID of the uploaded table
        """
        table_parent_id = self.getDatasetProject(datasetId=datasetId)
        existing_table_id = self.syn.findEntityId(
            name=table_name, parent=table_parent_id
        )

        tableOps = TableOperations(
            synStore=self,
            tableToLoad=table_manifest,
            tableName=table_name,
            datasetId=datasetId,
            existingTableId=existing_table_id,
            restrict=restrict,
            synapse_entity_tracker=self.synapse_entity_tracker,
        )

        if not table_manipulation or existing_table_id is None:
            manifest_table_id = tableOps.createTable(
                columnTypeDict=col_schema,
                specifySchema=True,
            )
        elif table_manipulation.lower() == "replace":
            manifest_table_id = tableOps.replaceTable(
                specifySchema=True,
                columnTypeDict=col_schema,
            )
        elif table_manipulation.lower() == "upsert":
            manifest_table_id = tableOps.upsertTable(
                dmge=dmge,
            )
        elif table_manipulation.lower() == "update":
            manifest_table_id = tableOps.updateTable()
        else:
            raise ValueError(
                f"The provided table_manipulation: {table_manipulation} is not valid; "
                "allowed values are 'replace', 'upsert', and 'update'."
            )

        if table_manipulation and table_manipulation.lower() == "upsert":
            table_entity = self.synapse_entity_tracker.get(
                synapse_id=existing_table_id or manifest_table_id,
                syn=self.syn,
                download_file=False,
            )
            annos = OldAnnotations(
                id=table_entity.id,
                etag=table_entity.etag,
                values=table_entity.annotations,
            )
            annos["primary_key"] = table_manifest["Component"][0] + "_id"
            annos = self.syn.set_annotations(annos)
            table_entity.etag = annos.etag
            table_entity.annotations = annos

        return manifest_table_id
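    # Note on the upsert branch above: the `primary_key` annotation records
    # which column uniquely identifies rows for subsequent upserts. For
    # example (illustrative), a manifest whose Component column holds
    # "Biospecimen" gets the annotation primary_key="Biospecimen_id".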
    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
    def upload_manifest_file(
        self,
        manifest,
        metadataManifestPath,
        datasetId,
        restrict_manifest,
        component_name="",
    ):
        # Update the local manifest copy to include the new entityId column
        manifest.to_csv(metadataManifestPath, index=False)

        # Store the manifest to Synapse as a CSV; update the file name first
        file_name_full = metadataManifestPath.split("/")[-1]
        file_extension = file_name_full.split(".")[-1]

        # Differentiate "censored" and "uncensored" manifests
        if "censored" in file_name_full:
            file_name_new = (
                os.path.basename(CONFIG.synapse_manifest_basename)
                + "_"
                + component_name
                + "_censored"
                + "."
                + file_extension
            )
        else:
            file_name_new = (
                os.path.basename(CONFIG.synapse_manifest_basename)
                + "_"
                + component_name
                + "."
                + file_extension
            )

        manifest_synapse_file = None
        try:
            # Rename the file to file_name_new, then revert afterwards.
            # This maintains the original file name in case other code
            # expects the file to exist under that name.
            original_file_path = metadataManifestPath
            new_file_path = os.path.join(
                os.path.dirname(metadataManifestPath), file_name_new
            )
            os.rename(original_file_path, new_file_path)

            manifest_synapse_file = self._store_file_for_manifest_upload(
                new_file_path=new_file_path,
                dataset_id=datasetId,
                existing_file_name=file_name_full,
                file_name_new=file_name_new,
                restrict_manifest=restrict_manifest,
            )
            manifest_synapse_file_id = manifest_synapse_file.id

        finally:
            # Revert the file name back to the original
            os.rename(new_file_path, original_file_path)

            if manifest_synapse_file:
                manifest_synapse_file.path = original_file_path

        return manifest_synapse_file_id
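    # Example (illustrative): with CONFIG.synapse_manifest_basename set to
    # "synapse_storage_manifest" and component_name "patient", an input file
    # named "synapse_storage_manifest_censored.csv" would be uploaded as
    # "synapse_storage_manifest_patient_censored.csv", while an uncensored
    # manifest would become "synapse_storage_manifest_patient.csv".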
    def _store_file_for_manifest_upload(
        self,
        new_file_path: str,
        dataset_id: str,
        existing_file_name: str,
        file_name_new: str,
        restrict_manifest: bool,
    ) -> File:
        """Handles a create or update of a manifest file that is going to be uploaded.
        If we already have a copy of the Entity in memory, we update that instance;
        otherwise we create a new File instance to be stored in Synapse. Once stored,
        this adds the file to the `synapse_entity_tracker` for future reference.

        Args:
            new_file_path (str): The path to the new manifest file
            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
            existing_file_name (str): The name of the existing file
            file_name_new (str): The name of the new file
            restrict_manifest (bool): Whether the manifest should be restricted

        Returns:
            File: The stored manifest file
        """
        local_tracked_file_instance = (
            self.synapse_entity_tracker.search_local_by_parent_and_name(
                name=existing_file_name, parent_id=dataset_id
            )
            or self.synapse_entity_tracker.search_local_by_parent_and_name(
                name=file_name_new, parent_id=dataset_id
            )
        )

        if local_tracked_file_instance:
            local_tracked_file_instance.path = new_file_path
            local_tracked_file_instance.description = (
                "Manifest for dataset " + dataset_id
            )
            manifest_synapse_file = local_tracked_file_instance
        else:
            manifest_synapse_file = File(
                path=new_file_path,
                description="Manifest for dataset " + dataset_id,
                parent=dataset_id,
                name=file_name_new,
            )

        manifest_synapse_file = self.syn.store(
            manifest_synapse_file, isRestricted=restrict_manifest
        )

        self.synapse_entity_tracker.add(
            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
        )
        return manifest_synapse_file

    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
        """Get annotations asynchronously.

        Args:
            synapse_id (str): Synapse ID of the entity that the annotations belong to

        Returns:
            Dict[str, Any]: The requested entity bundle matching
            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
        """
        return await get_entity_id_bundle2(
            entity_id=synapse_id,
            request={"includeAnnotations": True},
            synapse_client=self.syn,
        )

    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
        """Store annotations asynchronously.

        Args:
            annotation_dict (dict): annotations in dictionary format

        Returns:
            Annotations: The stored annotations.
        """
        annotation_data = Annotations.from_dict(
            synapse_annotations=annotation_dict["annotations"]["annotations"]
        )
        annotation_class = Annotations(
            annotations=annotation_data,
            etag=annotation_dict["annotations"]["etag"],
            id=annotation_dict["annotations"]["id"],
        )
        annotation_storage_result = await annotation_class.store_async(
            synapse_client=self.syn
        )
        local_entity = self.synapse_entity_tracker.get(
            synapse_id=annotation_dict["annotations"]["id"],
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if local_entity:
            local_entity.etag = annotation_storage_result.etag
            local_entity.annotations = annotation_storage_result
        return annotation_storage_result
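    # Example (illustrative sketch): the two coroutines above form a simple
    # read-modify-write round trip for a hypothetical entity "syn00000000":
    #
    #     async def refresh_annotations(store: "SynapseStorage") -> Annotations:
    #         bundle = await store.get_async_annotation("syn00000000")
    #         # ... modify bundle["annotations"]["annotations"] as needed ...
    #         return await store.store_async_annotation(bundle)
    #
    #     asyncio.run(refresh_annotations(store))
    #
    # The etag carried through the bundle keeps the local entity tracker
    # consistent with Synapse's optimistic concurrency checks.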
    def process_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        metadata_syn: Dict[str, Any],
        hide_blanks: bool,
        csv_list_regex: str,
        annos: Dict[str, Any],
        annotation_keys: str,
    ) -> Dict[str, Any]:
        """Processes metadata annotations based on the logic below:
        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
            an empty or whitespace-only string, or
            a NaN value (if the annotation is a float).
        If either condition is met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
        If either condition is met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.

        2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name".
        If the rule contains "list", split the string by commas and assign the resulting list as the annotation value for that key.

        3. Under any other conditions, the original value of anno_v is assigned to the annotation key (anno_k).

        4. Returns the updated annotations dictionary.

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer
            metadata_syn (dict): metadata used for Synapse storage
            hide_blanks (bool): if True, does not upload annotation keys with blank values
            csv_list_regex (str): regex to match a comma-separated list
            annos (Dict[str, Any]): dictionary of annotations returned from Synapse
            annotation_keys (str): display_label/class_label

        Returns:
            Dict[str, Any]: annotations as a dictionary

        ```mermaid
        flowchart TD
            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
            C -- Yes --> D{Is hide_blanks True?}
            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
            D -- No --> F[Assign empty string to annotation key]
            C -- No --> G{Is anno_v a string?}
            G -- No --> H[Assign original value of anno_v to annotation key]
            G -- Yes --> I{Does anno_v match csv_list_regex?}
            I -- Yes --> J[Get validation rule of anno_k]
            J --> K{Does the validation rule contain 'list'}
            K -- Yes --> L[Split anno_v by commas and assign as list]
            I -- No --> H
            K -- No --> H
        ```
        """
        for anno_k, anno_v in metadata_syn.items():
            # Remove keys with NaN values, empty strings, or whitespace-only
            # strings from the dict of annotations to be uploaded,
            # if present in the current data annotation
            if hide_blanks and (
                (isinstance(anno_v, str) and anno_v.strip() == "")
                or (isinstance(anno_v, float) and np.isnan(anno_v))
            ):
                annos["annotations"]["annotations"].pop(anno_k, None)
                continue

            # Otherwise save the annotation as appropriate
            if isinstance(anno_v, float) and np.isnan(anno_v):
                annos["annotations"]["annotations"][anno_k] = ""
                continue

            # Handle strings that match the csv_list_regex and pass the validation rule
            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
                # Use a dictionary to dynamically choose the argument
                param = (
                    {"node_display_name": anno_k}
                    if annotation_keys == "display_label"
                    else {"node_label": anno_k}
                )
                node_validation_rules = dmge.get_node_validation_rules(**param)

                if rule_in_rule_list("list", node_validation_rules):
                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
                    continue
            # Default: assign the original value
            annos["annotations"]["annotations"][anno_k] = anno_v

        return annos
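    # Example (illustrative): if a manifest cell holds the string "a,b,c" and
    # the attribute's validation rules include "list", the annotation is
    # uploaded as ["a", "b", "c"]; without a "list" rule, the literal string
    # "a,b,c" is kept as a single value.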
    @async_missing_entity_handler
    async def format_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        row: pd.Series,
        entityId: str,
        hideBlanks: bool,
        annotation_keys: str,
    ) -> Union[None, Dict[str, Any]]:
        """Format row annotations.

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer object
            row (pd.Series): row of the manifest
            entityId (str): entity id of the manifest
            hideBlanks (bool): when True, does not upload annotation keys with blank values; when False, uploads annotation keys with empty string values
            annotation_keys (str): display_label/class_label

        Returns:
            Union[None, Dict[str, Any]]: None if the entity is in the trash can; otherwise, the annotations
        """
        # Prepare metadata for Synapse storage: resolve display names into names that
        # Synapse annotations support (e.g. no spaces or parentheses).
        # Note: the removal of special characters applies only to annotation keys; we are
        # not altering the manifest. This could create a divergence between manifest
        # columns and annotations, which should be fine for most use cases.
        # Columns with special characters are outside of the schema.
        metadataSyn = {}
        blacklist_chars = ["(", ")", ".", " ", "-"]

        for k, v in row.to_dict().items():
            if annotation_keys == "display_label":
                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
            elif annotation_keys == "class_label":
                keySyn = get_class_label_from_display_name(str(k)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )

            # Skip `Filename` and `ETag` columns when setting annotations
            if keySyn in ["Filename", "ETag", "eTag"]:
                continue

            # Truncate annotation values that are 500 characters or longer,
            # appending an explicit [truncatedByDataCuratorApp] marker to the
            # end of every truncated value to indicate that the cell value
            # has been cut short
            if isinstance(v, str) and len(v) >= 500:
                v = v[0:472] + "[truncatedByDataCuratorApp]"

            metadataSyn[keySyn] = v

        # This will first check if the entity is already in memory, and if so, that
        # instance is used. Unfortunately, the expected return format needs to match
        # the Synapse API, so we need to convert the annotations to the expected format.
        entity = self.synapse_entity_tracker.get(
            synapse_id=entityId,
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if entity is not None:
            synapse_annotations = _convert_to_annotations_list(
                annotations=entity.annotations
            )
            annos = {
                "annotations": {
                    "id": entity.id,
                    "etag": entity.etag,
                    "annotations": synapse_annotations,
                }
            }
        else:
            annos = await self.get_async_annotation(entityId)

        # Set annotation(s) for the various objects/items in a dataset on Synapse
        csv_list_regex = comma_separated_list_regex()

        annos = self.process_row_annotations(
            dmge=dmge,
            metadata_syn=metadataSyn,
            hide_blanks=hideBlanks,
            csv_list_regex=csv_list_regex,
            annos=annos,
            annotation_keys=annotation_keys,
        )

        return annos
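    # Note on the truncation above (illustrative): a 600-character cell value
    # is stored as its first 472 characters plus the 27-character marker
    # "[truncatedByDataCuratorApp]", 499 characters in total, which keeps the
    # value under the 500-character threshold that triggers the check.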
1966 """ 1967 1968 entity = self.synapse_entity_tracker.get( 1969 synapse_id=manifest_synapse_id, syn=self.syn, download_file=False 1970 ) 1971 is_file = entity.concreteType.endswith(".FileEntity") 1972 is_table = entity.concreteType.endswith(".TableEntity") 1973 1974 if is_file: 1975 # Get file metadata 1976 metadata = self.getFileAnnotations(manifest_synapse_id) 1977 1978 # If there is a defined component add it to the metadata. 1979 if "Component" in manifest.columns: 1980 # Gather component information 1981 component = manifest["Component"].unique() 1982 1983 # Double check that only a single component is listed, else raise an error. 1984 try: 1985 len(component) == 1 1986 except ValueError as err: 1987 raise ValueError( 1988 f"Manifest has more than one component. Please check manifest and resubmit." 1989 ) from err 1990 1991 # Add component to metadata 1992 metadata["Component"] = component[0] 1993 1994 elif is_table: 1995 # Get table metadata 1996 metadata = self.getTableAnnotations(manifest_synapse_id) 1997 1998 # Get annotations 1999 annos = OldAnnotations( 2000 id=entity.id, etag=entity.etag, values=entity.annotations 2001 ) 2002 2003 # Add metadata to the annotations 2004 for annos_k, annos_v in metadata.items(): 2005 annos[annos_k] = annos_v 2006 2007 return annos 2008 2009 ''' 2010 def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath, 2011 useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False): 2012 """ 2013 Purpose: 2014 Works very similarly to associateMetadataWithFiles except takes in the manifest 2015 rather than the manifest path 2016 2017 """ 2018 2019 # Add uuid for table updates and fill. 2020 if not "Uuid" in manifest.columns: 2021 manifest["Uuid"] = '' 2022 2023 for idx,row in manifest.iterrows(): 2024 if not row["Uuid"]: 2025 gen_uuid = uuid.uuid4() 2026 row["Uuid"] = gen_uuid 2027 manifest.loc[idx, 'Uuid'] = gen_uuid 2028 2029 # add entityId as a column if not already there or 2030 # fill any blanks with an empty string. 2031 if not "entityId" in manifest.columns: 2032 manifest["entityId"] = "" 2033 else: 2034 manifest["entityId"].fillna("", inplace=True) 2035 2036 # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations 2037 dmge = DataModelGraphExplorer() 2038 2039 # Create table name here. 
    '''
    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
        """
        Purpose:
            Works very similarly to associateMetadataWithFiles except takes in the manifest
            rather than the manifest path

        """

        # Add uuid for table updates and fill.
        if not "Uuid" in manifest.columns:
            manifest["Uuid"] = ''

        for idx, row in manifest.iterrows():
            if not row["Uuid"]:
                gen_uuid = uuid.uuid4()
                row["Uuid"] = gen_uuid
                manifest.loc[idx, 'Uuid'] = gen_uuid

        # add entityId as a column if not already there or
        # fill any blanks with an empty string.
        if not "entityId" in manifest.columns:
            manifest["entityId"] = ""
        else:
            manifest["entityId"].fillna("", inplace=True)

        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
        dmge = DataModelGraphExplorer()

        # Create table name here.
        if 'Component' in manifest.columns:
            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
        else:
            table_name = 'synapse_storage_manifest_table'

        # Upload manifest as a table and get the SynID and manifest
        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
            dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)

        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
        # also set metadata for each synapse entity as Synapse annotations
        for idx, row in manifest.iterrows():
            if not row["entityId"]:
                # If not using entityIds, fill with manifest_table_id so
                row["entityId"] = manifest_synapse_table_id
                entityId = ''
            else:
                # get the entity id corresponding to this row
                entityId = row["entityId"]

        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)

        # Get annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)

        self.syn.set_annotations(manifest_annotations)

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
        self.make_synapse_table(
            table_to_load = table_manifest,
            dataset_id = datasetId,
            existingTableId = manifest_synapse_table_id,
            table_name = table_name,
            update_col = 'Uuid',
            specify_schema = False,
            )

        # Get annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
        self.syn.set_annotations(manifest_annotations)
        return manifest_synapse_table_id
    '''

    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
        """Helper function to read in the provided manifest as a pandas DataFrame for subsequent downstream processing.

        Args:
            metadataManifestPath (str): path where the manifest is stored

        Returns:
            manifest (pd.DataFrame): manifest loaded as a pandas dataframe

        Raises:
            FileNotFoundError: manifest file does not exist at the provided path
        """
        # read new manifest csv
        try:
            load_args = {
                "dtype": "string",
            }
            manifest = load_df(
                metadataManifestPath,
                preserve_raw_input=False,
                allow_na_values=False,
                **load_args,
            )
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"No manifest file was found at this path: {metadataManifestPath}"
            ) from err
        return manifest
    def _add_id_columns_to_manifest(
        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
    ):
        """Helper function to add Id and entityId columns to the manifest if they do not already exist, and fill Id values per row.

        Args:
            manifest (pd.DataFrame): manifest loaded as a dataframe
            dmge (DataModelGraphExplorer): data model graph explorer object

        Returns:
            pd.DataFrame: manifest with new Id and entityId columns (and UUID values) if they were not already present
        """
        # Add an Id column for table updates and fill it
        if not col_in_dataframe("Id", manifest):
            # See if the schema has a `Uuid` column specified
            try:
                uuid_col_in_schema = dmge.is_class_in_schema(
                    "Uuid"
                ) or dmge.is_class_in_schema("uuid")
            except KeyError:
                uuid_col_in_schema = False

            # Rename the `Uuid` column if it wasn't specified in the schema
            if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema:
                manifest.rename(columns={"Uuid": "Id"}, inplace=True)
            # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column
            else:
                manifest["Id"] = ""

        # Retrieve the Id column name; "id", "Id" and "ID" are treated the same
        id_col_name = [col for col in manifest.columns if col.lower() == "id"][0]

        # Check if values have been added to the Id column; if not, add a UUID so the value in the row is not blank
        for idx, row in manifest.iterrows():
            if not row[id_col_name]:
                manifest.loc[idx, id_col_name] = str(uuid.uuid4())

        # Add entityId as a column if not already there, or
        # fill any blanks with an empty string
        if not col_in_dataframe("entityId", manifest):
            manifest["entityId"] = ""
        else:
            manifest["entityId"].fillna("", inplace=True)

        return manifest

    def _generate_table_name(self, manifest):
        """Helper function to generate a table name for upload to Synapse.

        Args:
            manifest (pd.DataFrame): manifest loaded as a dataframe

        Returns:
            table_name (str): name of the table to load
            component_name (str): name of the manifest component (if applicable)
        """
        # Create the table name here
        if "Component" in manifest.columns:
            component_name = manifest["Component"][0].lower()
            table_name = component_name + "_synapse_storage_manifest_table"
        else:
            component_name = ""
            table_name = "synapse_storage_manifest_table"
        return table_name, component_name
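    # Example (illustrative): a manifest whose Component column holds
    # "Biospecimen" yields table_name
    # "biospecimen_synapse_storage_manifest_table" and component_name
    # "biospecimen"; a manifest without a Component column falls back to
    # "synapse_storage_manifest_table" with an empty component_name.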
    def _create_entity_id(self, idx, row, manifest, datasetId):
        """Helper function to generate an entityId and add it to the appropriate row in the manifest.

        Args:
            idx: index of the manifest row currently being processed
            row: current row of the manifest being processed
            manifest (pd.DataFrame): loaded df containing user supplied data
            datasetId (str): Synapse ID of the folder containing the dataset

        Returns:
            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
            entityId (str): generated entity ID
        """
        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
        rowEntity = self.syn.store(rowEntity)
        entityId = rowEntity["id"]
        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
        row["entityId"] = entityId
        manifest.loc[idx, "entityId"] = entityId
        return manifest, entityId

    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
        """Process annotations and store them on Synapse asynchronously.

        Args:
            requests (Set[asyncio.Task]): a set of annotation-formatting tasks created by the format_row_annotations function in the previous step

        Raises:
            RuntimeError: raised if a task failed to complete
        """
        while requests:
            done_tasks, pending_tasks = await asyncio.wait(
                requests, return_when=asyncio.FIRST_COMPLETED
            )
            requests = pending_tasks

            for completed_task in done_tasks:
                try:
                    annos = completed_task.result()

                    if isinstance(annos, Annotations):
                        logger.info(f"Successfully stored annotations for {annos.id}")
                    else:
                        # store annotations if they are not None
                        if annos:
                            entity_id = annos["annotations"]["id"]
                            logger.info(
                                f"Obtained and processed annotations for {entity_id} entity"
                            )
                            requests.add(
                                asyncio.create_task(
                                    self.store_async_annotation(annotation_dict=annos)
                                )
                            )
                except Exception as e:
                    raise RuntimeError(f"failed with {repr(e)}.") from e
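    # The loop above implements a two-phase pipeline: each completed
    # formatting task is immediately re-queued as a store task, so formatting
    # and storing overlap instead of running as two sequential batches. A
    # minimal sketch of the same pattern (fmt, store, and needs_store are
    # hypothetical stand-ins, not part of this module):
    #
    #     pending = {asyncio.create_task(fmt(r)) for r in rows}
    #     while pending:
    #         done, pending = await asyncio.wait(
    #             pending, return_when=asyncio.FIRST_COMPLETED
    #         )
    #         for task in done:
    #             result = task.result()
    #             if needs_store(result):  # hypothetical predicate
    #                 pending.add(asyncio.create_task(store(result)))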
    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
    async def add_annotations_to_entities_files(
        self,
        dmge,
        manifest,
        manifest_record_type: str,
        datasetId: str,
        hideBlanks: bool,
        manifest_synapse_table_id="",
        annotation_keys: str = "class_label",
    ):
        """
        Depending on the upload type, add Ids to the entityId column. Add annotations to
        connected files. Despite the name of this function, it also applies to folders.

        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data
            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both
            datasetId (str): Synapse ID of the folder containing the dataset
            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded
            manifest_synapse_table_id (str): Default is an empty string ''
            annotation_keys (str): display_label/class_label (default). Determines the labeling style for annotation keys. class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations

        Returns:
            manifest (pd.DataFrame): modified to add entityId as appropriate
        """

        # Expected behavior is to annotate files if `Filename` is present and if
        # file_annotations_upload is set to True, regardless of the `-mrt` setting
        if "filename" in [col.lower() for col in manifest.columns]:
            # Get the current list of files and store it as a dataframe
            dataset_files = self.getFilesInStorageDataset(datasetId)
            files_and_entityIds = self._get_file_entityIds(
                dataset_files=dataset_files, only_new_files=False
            )
            file_df = pd.DataFrame(files_and_entityIds)

            # Merge dataframes to add entityIds
            manifest = manifest.merge(
                file_df, how="left", on="Filename", suffixes=["_x", None]
            ).drop("entityId_x", axis=1)

        # Fill `entityId` for each row if missing, and annotate the entity as appropriate
        requests = set()
        for idx, row in manifest.iterrows():
            if not row["entityId"] and (
                manifest_record_type == "file_and_entities"
                or manifest_record_type == "table_file_and_entities"
            ):
                manifest, entityId = self._create_entity_id(
                    idx, row, manifest, datasetId
                )
            elif not row["entityId"] and manifest_record_type == "table_and_file":
                # If not using entityIds, fill the column with manifest_table_id
                # so the row is not blank
                row["entityId"] = manifest_synapse_table_id
                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
                entityId = ""
            # If the row is the manifest table, do not add annotations
            elif row["entityId"] == manifest_synapse_table_id:
                entityId = ""
            else:
                # Get the id of the file to annotate, collected in the step above
                entityId = row["entityId"]

            # Adding annotations to connected files
            if entityId:
                # Format annotations for Synapse
                annos_task = asyncio.create_task(
                    self.format_row_annotations(
                        dmge, row, entityId, hideBlanks, annotation_keys
                    )
                )
                requests.add(annos_task)
        await self._process_store_annos(requests)
        return manifest
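    # Example (illustrative): because this coroutine drives the async
    # annotation pipeline, callers run it to completion with asyncio.run, as
    # the upload_manifest_* methods below do. For a hypothetical dataset
    # "syn00000000":
    #
    #     manifest = asyncio.run(
    #         store.add_annotations_to_entities_files(
    #             dmge, manifest, "file_and_entities", "syn00000000", hideBlanks=True
    #         )
    #     )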
    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
    def upload_manifest_as_table(
        self,
        dmge: DataModelGraphExplorer,
        manifest: pd.DataFrame,
        metadataManifestPath: str,
        datasetId: str,
        table_name: str,
        component_name: str,
        restrict: bool,
        manifest_record_type: str,
        hideBlanks: bool,
        table_manipulation: str,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and csv.

        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data
            metadataManifestPath: path to csv containing a validated metadata manifest
            datasetId (str): Synapse ID of the folder containing the dataset
            table_name (str): generated name for the table being uploaded
            component_name (str): name of the component manifest that is currently being uploaded
            restrict (bool): flag for censored data
            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both
            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded
            table_manipulation (str): specifies how the manifest table should be stored on Synapse when a table with the same name already exists. Options are 'replace' and 'upsert'
            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting
            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations
            file_annotations_upload (bool): Default is True. If False, do not add annotations to files

        Returns:
            manifest_synapse_file_id: SynID of the manifest csv uploaded to Synapse
        """
        # Upload the manifest as a table, and get the ID and updated manifest
        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
        )

        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    manifest_synapse_table_id,
                    annotation_keys,
                )
            )
        # Load the manifest to Synapse as a CSV file
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            restrict_manifest=restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(annotations=manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update the manifest Synapse table with the new entityId column
        manifest_synapse_table_id, manifest, _ = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation="update",
            table_column_names=table_column_names,
        )

        # Set annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
        )
        annotations_manifest_table = self.syn.set_annotations(
            annotations=manifest_annotations
        )
        manifest_table_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
        )
        manifest_table_entity.annotations = annotations_manifest_table
        manifest_table_entity.etag = annotations_manifest_table.etag

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
    def upload_manifest_as_csv(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        restrict,
        manifest_record_type,
        hideBlanks,
        component_name,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a csv only.

        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data
            metadataManifestPath: path to csv containing a validated metadata manifest
            datasetId (str): Synapse ID of the folder containing the dataset
            restrict (bool): flag for censored data
            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both
            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded
            component_name (str): name of the component manifest that is currently being uploaded
            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations
            file_annotations_upload (bool): Default is True. If False, do not add annotations to files

        Returns:
            manifest_synapse_file_id (str): SynID of the manifest csv uploaded to Synapse
        """
        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    annotation_keys=annotation_keys,
                )
            )

        # Load the manifest to Synapse as a CSV file
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest,
            metadataManifestPath,
            datasetId,
            restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest
        manifest_annotations = self.format_manifest_annotations(
            manifest, manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
    def upload_manifest_combo(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        table_name,
        component_name,
        restrict,
        manifest_record_type,
        hideBlanks,
        table_manipulation,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and CSV with entities.

        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data
            metadataManifestPath: path to csv containing a validated metadata manifest
            datasetId (str): Synapse ID of the folder containing the dataset
            table_name (str): generated name for the table being uploaded
            component_name (str): name of the component manifest that is currently being uploaded
            restrict (bool): flag for censored data
            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both
            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded
            table_manipulation (str): specifies how the manifest table should be stored on Synapse when a table with the same name already exists. Options are 'replace' and 'upsert'
            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting
            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations
            file_annotations_upload (bool): Default is True. If False, do not add annotations to files

        Returns:
            manifest_synapse_file_id (str): SynID of the manifest csv uploaded to Synapse
        """
2525 """ 2526 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2527 dmge=dmge, 2528 manifest=manifest, 2529 datasetId=datasetId, 2530 table_name=table_name, 2531 restrict=restrict, 2532 table_manipulation=table_manipulation, 2533 table_column_names=table_column_names, 2534 ) 2535 2536 if file_annotations_upload: 2537 manifest = asyncio.run( 2538 self.add_annotations_to_entities_files( 2539 dmge, 2540 manifest, 2541 manifest_record_type, 2542 datasetId, 2543 hideBlanks, 2544 manifest_synapse_table_id, 2545 annotation_keys=annotation_keys, 2546 ) 2547 ) 2548 2549 # Load manifest to synapse as a CSV File 2550 manifest_synapse_file_id = self.upload_manifest_file( 2551 manifest, metadataManifestPath, datasetId, restrict, component_name 2552 ) 2553 2554 # Set annotations for the file manifest. 2555 manifest_annotations = self.format_manifest_annotations( 2556 manifest, manifest_synapse_file_id 2557 ) 2558 file_manifest_annoations = self.syn.set_annotations(manifest_annotations) 2559 manifest_entity = self.synapse_entity_tracker.get( 2560 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2561 ) 2562 manifest_entity.annotations = file_manifest_annoations 2563 manifest_entity.etag = file_manifest_annoations.etag 2564 logger.info("Associated manifest file with dataset on Synapse.") 2565 2566 # Update manifest Synapse table with new entity id column. 2567 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2568 dmge=dmge, 2569 manifest=manifest, 2570 datasetId=datasetId, 2571 table_name=table_name, 2572 restrict=restrict, 2573 table_manipulation="update", 2574 table_column_names=table_column_names, 2575 ) 2576 2577 # Set annotations for the table manifest 2578 manifest_annotations = self.format_manifest_annotations( 2579 manifest, manifest_synapse_table_id 2580 ) 2581 table_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2582 manifest_entity = self.synapse_entity_tracker.get( 2583 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2584 ) 2585 manifest_entity.annotations = table_manifest_annotations 2586 manifest_entity.etag = table_manifest_annotations.etag 2587 return manifest_synapse_file_id 2588 2589 @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") 2590 def associateMetadataWithFiles( 2591 self, 2592 dmge: DataModelGraphExplorer, 2593 metadataManifestPath: str, 2594 datasetId: str, 2595 manifest_record_type: str = "table_file_and_entities", 2596 hideBlanks: bool = False, 2597 restrict_manifest=False, 2598 table_manipulation: str = "replace", 2599 table_column_names: str = "class_label", 2600 annotation_keys: str = "class_label", 2601 file_annotations_upload: bool = True, 2602 ) -> str: 2603 """Associate metadata with files in a storage dataset already on Synapse. 2604 Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. 2605 2606 If this is a new manifest there could be no Synapse entities associated with the rows of this manifest 2607 this may be due to data type (e.g. clinical data) being tabular 2608 and not requiring files; to utilize uniform interfaces downstream 2609 (i.e. fileviews), a Synapse entity (a folder) is created for each row 2610 and an entity column is added to the manifest containing the resulting 2611 entity IDs; a table is also created at present as an additional interface 2612 for downstream query and interaction with the data. 

        Args:
            dmge: DataModelGraphExplorer object
            metadataManifestPath: path to csv containing a validated metadata manifest.
                The manifest should include a column entityId containing Synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
                Some datasets, e.g. clinical data, do not contain file ids; their data is stored in a table, one row per item.
                In this case, the system creates an entity (a folder) on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this entity.
            datasetId: Synapse ID of the folder containing the dataset
            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table' options in combination.
            hideBlanks: Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
            restrict_manifest (bool): Default is False. Flag for censored data.
            table_manipulation (str): Default is 'replace'. Specifies how the manifest table should be stored on Synapse when a table with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting.
            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default is True. If False, do not add annotations to files.

        Returns:
            manifest_synapse_file_id: SynID of the manifest csv uploaded to Synapse.
        """
        # Read the new manifest CSV
        manifest = self._read_manifest(metadataManifestPath)
        manifest = self._add_id_columns_to_manifest(manifest, dmge)

        table_name, component_name = self._generate_table_name(manifest)

        # Upload the manifest to Synapse based on user input (manifest_record_type)
        if manifest_record_type == "file_only":
            manifest_synapse_file_id = self.upload_manifest_as_csv(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                component_name=component_name,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        elif manifest_record_type == "table_and_file":
            manifest_synapse_file_id = self.upload_manifest_as_table(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                table_name=table_name,
                component_name=component_name,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                table_manipulation=table_manipulation,
                table_column_names=table_column_names,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        elif manifest_record_type == "file_and_entities":
            manifest_synapse_file_id = self.upload_manifest_as_csv(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                component_name=component_name,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        elif manifest_record_type == "table_file_and_entities":
            manifest_synapse_file_id = self.upload_manifest_combo(
                dmge=dmge,
                manifest=manifest,
                metadataManifestPath=metadataManifestPath,
                datasetId=datasetId,
                table_name=table_name,
                component_name=component_name,
                restrict=restrict_manifest,
                hideBlanks=hideBlanks,
                manifest_record_type=manifest_record_type,
                table_manipulation=table_manipulation,
                table_column_names=table_column_names,
                annotation_keys=annotation_keys,
                file_annotations_upload=file_annotations_upload,
            )
        else:
            raise ValueError("Please enter a valid manifest_record_type.")
        return manifest_synapse_file_id
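    # Example (illustrative): a typical submission flow, assuming an
    # authenticated SynapseStorage instance `store`, a DataModelGraphExplorer
    # `dmge`, and a hypothetical dataset "syn00000000":
    #
    #     manifest_id = store.associateMetadataWithFiles(
    #         dmge=dmge,
    #         metadataManifestPath="path/to/manifest.csv",
    #         datasetId="syn00000000",
    #         manifest_record_type="table_and_file",
    #     )
    #
    # The returned ID is always that of the manifest CSV, regardless of
    # whether a table or row entities were also created.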
2633 """ 2634 # Read new manifest CSV: 2635 manifest = self._read_manifest(metadataManifestPath) 2636 manifest = self._add_id_columns_to_manifest(manifest, dmge) 2637 2638 table_name, component_name = self._generate_table_name(manifest) 2639 2640 # Upload manifest to synapse based on user input (manifest_record_type) 2641 if manifest_record_type == "file_only": 2642 manifest_synapse_file_id = self.upload_manifest_as_csv( 2643 dmge=dmge, 2644 manifest=manifest, 2645 metadataManifestPath=metadataManifestPath, 2646 datasetId=datasetId, 2647 restrict=restrict_manifest, 2648 hideBlanks=hideBlanks, 2649 manifest_record_type=manifest_record_type, 2650 component_name=component_name, 2651 annotation_keys=annotation_keys, 2652 file_annotations_upload=file_annotations_upload, 2653 ) 2654 elif manifest_record_type == "table_and_file": 2655 manifest_synapse_file_id = self.upload_manifest_as_table( 2656 dmge=dmge, 2657 manifest=manifest, 2658 metadataManifestPath=metadataManifestPath, 2659 datasetId=datasetId, 2660 table_name=table_name, 2661 component_name=component_name, 2662 restrict=restrict_manifest, 2663 hideBlanks=hideBlanks, 2664 manifest_record_type=manifest_record_type, 2665 table_manipulation=table_manipulation, 2666 table_column_names=table_column_names, 2667 annotation_keys=annotation_keys, 2668 file_annotations_upload=file_annotations_upload, 2669 ) 2670 elif manifest_record_type == "file_and_entities": 2671 manifest_synapse_file_id = self.upload_manifest_as_csv( 2672 dmge=dmge, 2673 manifest=manifest, 2674 metadataManifestPath=metadataManifestPath, 2675 datasetId=datasetId, 2676 restrict=restrict_manifest, 2677 hideBlanks=hideBlanks, 2678 manifest_record_type=manifest_record_type, 2679 component_name=component_name, 2680 annotation_keys=annotation_keys, 2681 file_annotations_upload=file_annotations_upload, 2682 ) 2683 elif manifest_record_type == "table_file_and_entities": 2684 manifest_synapse_file_id = self.upload_manifest_combo( 2685 dmge=dmge, 2686 manifest=manifest, 2687 metadataManifestPath=metadataManifestPath, 2688 datasetId=datasetId, 2689 table_name=table_name, 2690 component_name=component_name, 2691 restrict=restrict_manifest, 2692 hideBlanks=hideBlanks, 2693 manifest_record_type=manifest_record_type, 2694 table_manipulation=table_manipulation, 2695 table_column_names=table_column_names, 2696 annotation_keys=annotation_keys, 2697 file_annotations_upload=file_annotations_upload, 2698 ) 2699 else: 2700 raise ValueError("Please enter a valid manifest_record_type.") 2701 return manifest_synapse_file_id 2702 2703 def getTableAnnotations(self, table_id: str): 2704 """Generate dictionary of annotations for the given Synapse file. 2705 Synapse returns all custom annotations as lists since they 2706 can contain multiple values. In all cases, the values will 2707 be converted into strings and concatenated with ", ". 2708 2709 Args: 2710 fileId (str): Synapse ID for dataset file. 2711 2712 Returns: 2713 dict: Annotations as comma-separated strings. 
2714 """ 2715 try: 2716 entity = self.synapse_entity_tracker.get( 2717 synapse_id=table_id, syn=self.syn, download_file=False 2718 ) 2719 is_table = entity.concreteType.endswith(".TableEntity") 2720 annotations_raw = entity.annotations 2721 except SynapseHTTPError: 2722 # If an error occurs with retrieving entity, skip it 2723 # This could be caused by a temporary file view that 2724 # was deleted since its ID was retrieved 2725 is_file, is_table = False, False 2726 2727 # Skip anything that isn't a file or folder 2728 if not (is_table): 2729 return None 2730 2731 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2732 2733 return annotations 2734 2735 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2736 """Generate dictionary of annotations for the given Synapse file. 2737 Synapse returns all custom annotations as lists since they 2738 can contain multiple values. In all cases, the values will 2739 be converted into strings and concatenated with ", ". 2740 2741 Args: 2742 fileId (str): Synapse ID for dataset file. 2743 2744 Returns: 2745 dict: Annotations as comma-separated strings. 2746 """ 2747 2748 # Get entity metadata, including annotations 2749 try: 2750 entity = self.synapse_entity_tracker.get( 2751 synapse_id=fileId, syn=self.syn, download_file=False 2752 ) 2753 is_file = entity.concreteType.endswith(".FileEntity") 2754 is_folder = entity.concreteType.endswith(".Folder") 2755 annotations_raw = entity.annotations 2756 except SynapseHTTPError: 2757 # If an error occurs with retrieving entity, skip it 2758 # This could be caused by a temporary file view that 2759 # was deleted since its ID was retrieved 2760 is_file, is_folder = False, False 2761 2762 # Skip anything that isn't a file or folder 2763 if not (is_file or is_folder): 2764 return None 2765 2766 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2767 2768 return annotations 2769 2770 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2771 # Extract annotations from their lists and stringify. For example: 2772 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2773 annotations = dict() 2774 for key, vals in annotations_raw.items(): 2775 if isinstance(vals, list) and len(vals) == 1: 2776 annotations[key] = str(vals[0]) 2777 else: 2778 annotations[key] = ", ".join(str(v) for v in vals) 2779 2780 # Add the file entity ID and eTag, which weren't lists 2781 assert fileId == entity.id, ( 2782 "For some reason, the Synapse ID in the response doesn't match" 2783 "the Synapse ID sent in the request (via synapseclient)." 2784 ) 2785 annotations["entityId"] = fileId 2786 annotations["eTag"] = entity.etag 2787 2788 return annotations 2789 2790 def getDatasetAnnotations( 2791 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2792 ) -> pd.DataFrame: 2793 """Generate table for annotations across all files in given dataset. 2794 2795 Args: 2796 datasetId (str): Synapse ID for dataset folder. 2797 fill_na (bool): Whether to replace missing values with 2798 blank strings. 2799 force_batch (bool): Whether to force the function to use 2800 the batch mode, which uses a file view to retrieve 2801 annotations for a given dataset. Default to False 2802 unless there are more than 50 files in the dataset. 2803 2804 Returns: 2805 pd.DataFrame: Table of annotations. 
2806 """ 2807 # Get all files in given dataset 2808 dataset_files = self.getFilesInStorageDataset(datasetId) 2809 2810 # if there are no dataset files, there are no annotations 2811 # return None 2812 if not dataset_files: 2813 return pd.DataFrame() 2814 2815 dataset_files_map = dict(dataset_files) 2816 dataset_file_ids, _ = list(zip(*dataset_files)) 2817 2818 # Get annotations for each file from Step 1 2819 # Batch mode 2820 try_batch = len(dataset_files) >= 50 or force_batch 2821 if try_batch: 2822 try: 2823 logger.info("Trying batch mode for retrieving Synapse annotations") 2824 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2825 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2826 logger.info( 2827 f"Unable to create a temporary file view bound to {datasetId}. " 2828 "Defaulting to slower iterative retrieval of annotations." 2829 ) 2830 # Default to the slower non-batch method 2831 logger.info("Batch mode failed (probably due to permission error)") 2832 try_batch = False 2833 2834 # Non-batch mode 2835 if not try_batch: 2836 logger.info("Using slower (non-batch) sequential mode") 2837 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2838 # Remove any annotations for non-file/folders (stored as None) 2839 records = filter(None, records) 2840 table = pd.DataFrame.from_records(records) 2841 2842 # Add filenames for the files that "survived" annotation retrieval 2843 filenames = [dataset_files_map[i] for i in table["entityId"]] 2844 2845 if "Filename" not in table.columns: 2846 table.insert(0, "Filename", filenames) 2847 2848 # Ensure that entityId and eTag are at the end 2849 entity_ids = table.pop("entityId") 2850 etags = table.pop("eTag") 2851 table.insert(len(table.columns), "entityId", entity_ids) 2852 table.insert(len(table.columns), "eTag", etags) 2853 2854 # Missing values are filled in with empty strings for Google Sheets 2855 if fill_na: 2856 table.fillna("", inplace=True) 2857 2858 # Force all values as strings 2859 return table.astype(str) 2860 2861 def raise_final_error(retry_state): 2862 return retry_state.outcome.result() 2863 2864 def checkIfinAssetView(self, syn_id) -> str: 2865 # get data in administrative fileview for this pipeline 2866 assetViewTable = self.getStorageFileviewTable() 2867 all_files = list(assetViewTable["id"]) 2868 if syn_id in all_files: 2869 return True 2870 else: 2871 return False 2872 2873 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2874 @retry( 2875 stop=stop_after_attempt(5), 2876 wait=wait_chain( 2877 *[wait_fixed(10) for i in range(2)] 2878 + [wait_fixed(15) for i in range(2)] 2879 + [wait_fixed(20)] 2880 ), 2881 retry=retry_if_exception_type(LookupError), 2882 retry_error_callback=raise_final_error, 2883 ) 2884 def getDatasetProject(self, datasetId: str) -> str: 2885 """Get parent project for a given dataset ID. 2886 2887 Args: 2888 datasetId (str): Synapse entity ID (folder or project). 2889 2890 Raises: 2891 ValueError: Raised if Synapse ID cannot be retrieved 2892 by the user or if it doesn't appear in the file view. 2893 2894 Returns: 2895 str: The Synapse ID for the parent project. 
2896 """ 2897 2898 # Subset main file view 2899 dataset_index = self.storageFileviewTable["id"] == datasetId 2900 dataset_row = self.storageFileviewTable[dataset_index] 2901 2902 # re-query if no datasets found 2903 if dataset_row.empty: 2904 sleep(5) 2905 self.query_fileview(force_requery=True) 2906 # Subset main file view 2907 dataset_index = self.storageFileviewTable["id"] == datasetId 2908 dataset_row = self.storageFileviewTable[dataset_index] 2909 2910 # Return `projectId` for given row if only one found 2911 if len(dataset_row) == 1: 2912 dataset_project = dataset_row["projectId"].values[0] 2913 return dataset_project 2914 2915 # Otherwise, check if already project itself 2916 try: 2917 syn_object = self.synapse_entity_tracker.get( 2918 synapse_id=datasetId, syn=self.syn, download_file=False 2919 ) 2920 if syn_object.properties["concreteType"].endswith("Project"): 2921 return datasetId 2922 except SynapseHTTPError: 2923 raise PermissionError( 2924 f"The given dataset ({datasetId}) isn't accessible with this " 2925 "user. This might be caused by a typo in the dataset Synapse ID." 2926 ) 2927 2928 # If not, then assume dataset not in file view 2929 raise LookupError( 2930 f"The given dataset ({datasetId}) doesn't appear in the " 2931 f"configured file view ({self.storageFileview}). This might " 2932 "mean that the file view's scope needs to be updated." 2933 ) 2934 2935 def getDatasetAnnotationsBatch( 2936 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2937 ) -> pd.DataFrame: 2938 """Generate table for annotations across all files in given dataset. 2939 This function uses a temporary file view to generate a table 2940 instead of iteratively querying for individual entity annotations. 2941 This function is expected to run much faster than 2942 `self.getDatasetAnnotationsBatch` on large datasets. 2943 2944 Args: 2945 datasetId (str): Synapse ID for dataset folder. 2946 dataset_file_ids (Sequence[str]): List of Synapse IDs 2947 for dataset files/folders used to subset the table. 2948 2949 Returns: 2950 pd.DataFrame: Table of annotations. 2951 """ 2952 # Create data frame from annotations file view 2953 with DatasetFileView(datasetId, self.syn) as fileview: 2954 table = fileview.query() 2955 2956 if dataset_file_ids: 2957 table = table.loc[table.index.intersection(dataset_file_ids)] 2958 2959 table = table.reset_index(drop=True) 2960 2961 return table 2962 2963 def _get_table_schema_by_cname(self, table_schema): 2964 # assume no duplicate column names in the table 2965 table_schema_by_cname = {} 2966 2967 for col_record in table_schema: 2968 # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key) 2969 table_schema_by_cname[col_record["name"]] = col_record 2970 2971 return table_schema_by_cname 2972 2973 2974class TableOperations: 2975 """ 2976 Object to hold functions for various table operations specific to the Synapse Asset Store. 
2977 2978 Currently implemented operations are: 2979 createTable: upload a manifest as a new table when none exist 2980 replaceTable: replace the metadata in a table from one manifest with metadata from another manifest 2981 updateTable: add a column to a table that already exists on synapse 2982 2983 Operations currently in development are: 2984 upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest 2985 """ 2986 2987 def __init__( 2988 self, 2989 synStore: SynapseStorage, 2990 tableToLoad: pd.DataFrame = None, 2991 tableName: str = None, 2992 datasetId: str = None, 2993 existingTableId: str = None, 2994 restrict: bool = False, 2995 synapse_entity_tracker: SynapseEntityTracker = None, 2996 ): 2997 """ 2998 Class governing table operations (creation, replacement, upserts, updates) in schematic 2999 3000 tableToLoad: manifest formatted appropriately for the table 3001 tableName: name of the table to be uploaded 3002 datasetId: synID of the dataset for the manifest 3003 existingTableId: synId of the table currently existing on synapse (if there is one) 3004 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 3005 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 3006 3007 """ 3008 self.synStore = synStore 3009 self.tableToLoad = tableToLoad 3010 self.tableName = tableName 3011 self.datasetId = datasetId 3012 self.existingTableId = existingTableId 3013 self.restrict = restrict 3014 self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker() 3015 3016 @tracer.start_as_current_span("TableOperations::createTable") 3017 def createTable( 3018 self, 3019 columnTypeDict: dict = None, 3020 specifySchema: bool = True, 3021 ): 3022 """ 3023 Method to create a table from a metadata manifest and upload it to synapse 3024 3025 Args: 3026 columnTypeDict: dictionary schema for table columns: type, size, etc 3027 specifySchema: to specify a specific schema for the table format 3028 3029 Returns: 3030 table.schema.id: synID of the newly created table 3031 """ 3032 datasetEntity = self.synapse_entity_tracker.get( 3033 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3034 ) 3035 datasetName = datasetEntity.name 3036 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3037 3038 if not self.tableName: 3039 self.tableName = datasetName + "table" 3040 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3041 if specifySchema: 3042 if columnTypeDict == {}: 3043 logger.error("Did not provide a columnTypeDict.") 3044 # create list of columns: 3045 cols = [] 3046 for col in self.tableToLoad.columns: 3047 if col in table_schema_by_cname: 3048 col_type = table_schema_by_cname[col]["columnType"] 3049 max_size = ( 3050 table_schema_by_cname[col]["maximumSize"] 3051 if "maximumSize" in table_schema_by_cname[col].keys() 3052 else 100 3053 ) 3054 max_list_len = 250 3055 if max_size and max_list_len: 3056 cols.append( 3057 Column( 3058 name=col, 3059 columnType=col_type, 3060 maximumSize=max_size, 3061 maximumListLength=max_list_len, 3062 ) 3063 ) 3064 elif max_size: 3065 cols.append( 3066 Column(name=col, columnType=col_type, maximumSize=max_size) 3067 ) 3068 else: 3069 cols.append(Column(name=col, columnType=col_type)) 3070 else: 3071 # TODO add warning that the given col was not found and its max size is set to 100 3072 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3073 schema =
Schema( 3074 name=self.tableName, columns=cols, parent=datasetParentProject 3075 ) 3076 table = Table(schema, self.tableToLoad) 3077 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3078 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3079 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3080 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3081 return table.schema.id 3082 else: 3083 # For just uploading the tables to synapse using default 3084 # column types. 3085 table = build_table(self.tableName, datasetParentProject, self.tableToLoad) 3086 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3087 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3088 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3089 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3090 return table.schema.id 3091 3092 @tracer.start_as_current_span("TableOperations::replaceTable") 3093 def replaceTable( 3094 self, 3095 specifySchema: bool = True, 3096 columnTypeDict: dict = None, 3097 ): 3098 """ 3099 Method to replace an existing table on synapse with metadata from a new manifest 3100 3101 Args: 3102 specifySchema: whether to specify a schema for the table format 3103 columnTypeDict: dictionary schema for table columns: type, size, etc 3104 3105 Returns: 3106 existingTableId: synID of the already existing table that had its metadata replaced 3107 """ 3108 datasetEntity = self.synapse_entity_tracker.get( 3109 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3110 ) 3111 3112 datasetName = datasetEntity.name 3113 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3114 existing_table, existing_results = self.synStore.get_synapse_table( 3115 self.existingTableId 3116 ) 3117 # remove rows 3118 self.synStore.syn.delete(existing_results) 3119 # Data changes such as removing all rows cause the eTag to change.
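# The tracked copy of this entity is therefore stale: the next call drops it
# from the pull-through cache so it is re-fetched from Synapse when needed.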
3120 self.synapse_entity_tracker.remove(synapse_id=self.existingTableId) 3121 # wait for row deletion to finish on synapse before getting empty table 3122 sleep(10) 3123 3124 # removes all current columns 3125 current_table = self.synapse_entity_tracker.get( 3126 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3127 ) 3128 3129 current_columns = self.synStore.syn.getTableColumns(current_table) 3130 for col in current_columns: 3131 current_table.removeColumn(col) 3132 3133 if not self.tableName: 3134 self.tableName = datasetName + "table" 3135 3136 # Process columns according to manifest entries 3137 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3138 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3139 if specifySchema: 3140 if columnTypeDict == {}: 3141 logger.error("Did not provide a columnTypeDict.") 3142 # create list of columns: 3143 cols = [] 3144 3145 for col in self.tableToLoad.columns: 3146 if col in table_schema_by_cname: 3147 col_type = table_schema_by_cname[col]["columnType"] 3148 max_size = ( 3149 table_schema_by_cname[col]["maximumSize"] 3150 if "maximumSize" in table_schema_by_cname[col].keys() 3151 else 100 3152 ) 3153 max_list_len = 250 3154 if max_size and max_list_len: 3155 cols.append( 3156 Column( 3157 name=col, 3158 columnType=col_type, 3159 maximumSize=max_size, 3160 maximumListLength=max_list_len, 3161 ) 3162 ) 3163 elif max_size: 3164 cols.append( 3165 Column(name=col, columnType=col_type, maximumSize=max_size) 3166 ) 3167 else: 3168 cols.append(Column(name=col, columnType=col_type)) 3169 else: 3170 # TODO add warning that the given col was not found and its max size is set to 100 3171 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3172 3173 # adds new columns to schema 3174 for col in cols: 3175 current_table.addColumn(col) 3176 table_result = self.synStore.syn.store( 3177 current_table, isRestricted=self.restrict 3178 ) 3179 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3180 # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema) 3181 self.synapse_entity_tracker.remove(synapse_id=table_result.id) 3182 3183 # wait for synapse store to finish 3184 sleep(1) 3185 3186 # build schema and table from columns and store with necessary restrictions 3187 schema = Schema( 3188 name=self.tableName, columns=cols, parent=datasetParentProject 3189 ) 3190 schema.id = self.existingTableId 3191 table = Table(schema, self.tableToLoad, etag=existing_results.etag) 3192 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3193 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3194 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3195 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3196 else: 3197 logger.error("Must specify a schema for table replacements") 3198 3199 # remove system metadata from manifest 3200 existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) 3201 return self.existingTableId 3202 3203 @tracer.start_as_current_span("TableOperations::_get_auth_token") 3204 def _get_auth_token( 3205 self, 3206 ): 3207 authtoken = None 3208 3209 # Get access token from environment variable if available 3210 # Primarily useful for testing environments, with other possible usefulness for containers 3211 env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") 3212 if env_access_token: 3213 authtoken
= env_access_token 3214 return authtoken 3215 3216 # Get token from authorization header 3217 # Primarily useful for API endpoint functionality 3218 if "Authorization" in self.synStore.syn.default_headers: 3219 authtoken = self.synStore.syn.default_headers["Authorization"].split( 3220 "Bearer " 3221 )[-1] 3222 return authtoken 3223 3224 # retrieve credentials from synapse object 3225 # Primarily useful for local users; credentials are usually only stored here when a .synapseConfig file is used, but checking to be safe 3226 synapse_object_creds = self.synStore.syn.credentials 3227 if hasattr(synapse_object_creds, "_token"): 3228 authtoken = synapse_object_creds.secret 3229 3230 # Try getting creds from .synapseConfig file if it exists 3231 # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in 3232 if os.path.exists(CONFIG.synapse_configuration_path): 3233 config = get_config_file(CONFIG.synapse_configuration_path) 3234 3235 # check which credentials are provided in file 3236 if config.has_option("authentication", "authtoken"): 3237 authtoken = config.get("authentication", "authtoken") 3238 3239 # raise error if required credentials are not found 3240 if not authtoken: 3241 raise NameError( 3242 "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file" 3243 ) 3244 3245 return authtoken 3246 3247 @tracer.start_as_current_span("TableOperations::upsertTable") 3248 def upsertTable(self, dmge: DataModelGraphExplorer): 3249 """ 3250 Method to upsert rows from a new manifest into an existing table on synapse 3251 For upsert functionality to work, primary keys must follow the naming convention of <component>_id 3252 `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. 3253 Currently it is required to use -dl/--use_display_label with table upserts.
3254 3255 3256 Args: 3257 dmge: DataModelGraphExplorer instance 3258 3259 Returns: 3260 existingTableId: synID of the existing table that rows were upserted into 3261 """ 3262 3263 authtoken = self._get_auth_token() 3264 3265 synapseDB = SynapseDatabase( 3266 auth_token=authtoken, 3267 project_id=self.synStore.getDatasetProject(self.datasetId), 3268 syn=self.synStore.syn, 3269 synapse_entity_tracker=self.synapse_entity_tracker, 3270 ) 3271 3272 try: 3273 # Try performing upsert 3274 synapseDB.upsert_table_rows( 3275 table_name=self.tableName, data=self.tableToLoad 3276 ) 3277 except SynapseHTTPError as ex: 3278 # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload 3279 if "Id is not a valid column name or id" in str(ex): 3280 self._update_table_uuid_column(dmge) 3281 synapseDB.upsert_table_rows( 3282 table_name=self.tableName, data=self.tableToLoad 3283 ) 3284 # Raise if other error 3285 else: 3286 raise ex 3287 3288 return self.existingTableId 3289 3290 @tracer.start_as_current_span("TableOperations::_update_table_uuid_column") 3291 def _update_table_uuid_column( 3292 self, 3293 dmge: DataModelGraphExplorer, 3294 ) -> None: 3295 """Removes the `Uuid` column when present, and replaces it with an `Id` column 3296 Used to enable backwards compatibility for manifests using the old `Uuid` convention 3297 3298 Args: 3299 dmge: DataModelGraphExplorer instance 3300 3301 Returns: 3302 None 3303 """ 3304 3305 # Get the columns of the schema 3306 schema = self.synapse_entity_tracker.get( 3307 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3308 ) 3309 3310 cols = self.synStore.syn.getTableColumns(schema) 3311 3312 # Iterate through columns until `Uuid` column is found 3313 for col in cols: 3314 if col.name.lower() == "uuid": 3315 # See if schema has `Uuid` column specified 3316 try: 3317 uuid_col_in_schema = dmge.is_class_in_schema(col.name) 3318 except KeyError: 3319 uuid_col_in_schema = False 3320 3321 # If there is, then create a new `Id` column from scratch 3322 if uuid_col_in_schema: 3323 new_col = Column(columnType="STRING", maximumSize=64, name="Id") 3324 schema.addColumn(new_col) 3325 schema = self.synStore.syn.store(schema) 3326 # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema) 3327 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3328 self.synapse_entity_tracker.remove(synapse_id=schema.id) 3329 # If there is not, then use the old `Uuid` column as a basis for the new `Id` column 3330 else: 3331 # Build ColumnModel that will be used for new column 3332 id_column = Column( 3333 name="Id", 3334 columnType="STRING", 3335 maximumSize=64, 3336 defaultValue=None, 3337 maximumListLength=1, 3338 ) 3339 new_col_response = self.synStore.syn.store(id_column) 3340 3341 # Define columnChange body 3342 columnChangeDict = { 3343 "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest", 3344 "entityId": self.existingTableId, 3345 "changes": [ 3346 { 3347 "oldColumnId": col["id"], 3348 "newColumnId": new_col_response["id"], 3349 } 3350 ], 3351 } 3352 3353 self.synStore.syn._async_table_update( 3354 table=self.existingTableId, 3355 changes=[columnChangeDict], 3356 wait=False, 3357 ) 3358 break 3359 3360 return 3361 3362 @tracer.start_as_current_span("TableOperations::updateTable") 3363 def updateTable( 3364 self, 3365 update_col: str = "Id", 3366 ): 3367 """ 3368 Method to update an existing table with a new column 3369 3370 Args: 3371
update_col: column to index the old and new tables on 3372 3373 Returns: 3374 existingTableId: synID of the existing table that was updated 3375 """ 3376 existing_table, existing_results = self.synStore.get_synapse_table( 3377 self.existingTableId 3378 ) 3379 3380 self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) 3381 # store table with existing etag data and impose restrictions as appropriate 3382 table_result = self.synStore.syn.store( 3383 Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), 3384 isRestricted=self.restrict, 3385 ) 3386 # We cannot store the Table to the `synapse_entity_tracker` because there is 3387 # no `Schema` on the table object. The above `.store()` function call would 3388 # also update the ETag of the entity within Synapse. Remove it from the tracker 3389 # and re-retrieve it later on if needed again. 3390 self.synapse_entity_tracker.remove(synapse_id=table_result.tableId) 3391 3392 return self.existingTableId 3393 3394 3395class DatasetFileView: 3396 """Helper class to create temporary dataset file views. 3397 This class can be used in conjunction with a 'with' statement. 3398 This will ensure that the file view is deleted automatically. 3399 See SynapseStorage.getDatasetAnnotationsBatch for example usage. 3400 """ 3401 3402 def __init__( 3403 self, 3404 datasetId: str, 3405 synapse: Synapse, 3406 name: str = None, 3407 temporary: bool = True, 3408 parentId: str = None, 3409 ) -> None: 3410 """Create a file view scoped to a dataset folder. 3411 3412 Args: 3413 datasetId (str): Synapse ID for a dataset folder/project. 3414 synapse (Synapse): Used for Synapse requests. 3415 name (str): Name of the file view (temporary or not). 3416 temporary (bool): Whether to delete the file view on exit 3417 of either a 'with' statement or Python entirely. 3418 parentId (str, optional): Synapse ID specifying where to 3419 store the file view. Defaults to datasetId. 3420 """ 3421 3422 self.datasetId = datasetId 3423 self.synapse = synapse 3424 self.is_temporary = temporary 3425 3426 if name is None: 3427 self.name = f"schematic annotation file view for {self.datasetId}" 3428 3429 if self.is_temporary: 3430 uid = secrets.token_urlsafe(5) 3431 self.name = f"{self.name} - UID {uid}" 3432 3433 # TODO: Allow a DCC admin to configure a "universal parent" 3434 # Such as a Synapse project writeable by everyone.
3435 self.parentId = datasetId if parentId is None else parentId 3436 3437 # TODO: Create local sharing setting to hide from everyone else 3438 view_schema = EntityViewSchema( 3439 name=self.name, 3440 parent=self.parentId, 3441 scopes=self.datasetId, 3442 includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], 3443 addDefaultViewColumns=False, 3444 addAnnotationColumns=True, 3445 ) 3446 3447 # TODO: Handle failure due to insufficient permissions by 3448 # creating a temporary new project to store view 3449 self.view_schema = self.synapse.store(view_schema) 3450 3451 # These are filled in after calling `self.query()` 3452 self.results = None 3453 self.table = None 3454 3455 # Ensure deletion of the file view (last resort) 3456 if self.is_temporary: 3457 atexit.register(self.delete) 3458 3459 def __enter__(self): 3460 """Return file view when entering 'with' statement.""" 3461 return self 3462 3463 def __exit__(self, exc_type, exc_value, traceback): 3464 """Delete file view when exiting 'with' statement.""" 3465 if self.is_temporary: 3466 self.delete() 3467 3468 def delete(self): 3469 """Delete the file view on Synapse without deleting local table.""" 3470 if self.view_schema is not None: 3471 self.synapse.delete(self.view_schema) 3472 self.view_schema = None 3473 3474 def query(self, tidy=True, force=False): 3475 """Retrieve file view as a data frame (raw format sans index).""" 3476 if self.table is None or force: 3477 fileview_id = self.view_schema["id"] 3478 self.results = self.synapse.tableQuery(f"select * from {fileview_id}") 3479 self.table = self.results.asDataFrame( 3480 rowIdAndVersionInIndex=False, 3481 na_values=STR_NA_VALUES_FILTERED, 3482 keep_default_na=False, 3483 ) 3484 if tidy: 3485 self.tidy_table() 3486 return self.table 3487 3488 def tidy_table(self): 3489 """Convert raw file view data frame into more usable format.""" 3490 assert self.table is not None, "Must call `self.query()` first." 
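# Run the fix-up passes below in order: rename/drop the default view columns,
# flatten list-valued columns, then normalize integer columns.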
3491 self._fix_default_columns() 3492 self._fix_list_columns() 3493 self._fix_int_columns() 3494 return self.table 3495 3496 def _fix_default_columns(self): 3497 """Rename default columns to match schematic expectations.""" 3498 3499 # Drop ROW_VERSION column if present 3500 if "ROW_VERSION" in self.table: 3501 del self.table["ROW_VERSION"] 3502 3503 # Rename id column to entityId and set as data frame index 3504 if "ROW_ID" in self.table: 3505 self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str) 3506 self.table = self.table.set_index("entityId", drop=False) 3507 del self.table["ROW_ID"] 3508 3509 # Rename ROW_ETAG column to eTag and place at end of data frame 3510 if "ROW_ETAG" in self.table: 3511 row_etags = self.table.pop("ROW_ETAG") 3512 3513 # eTag column may already present if users annotated data without submitting manifest 3514 # we're only concerned with the new values and not the existing ones 3515 if "eTag" in self.table: 3516 del self.table["eTag"] 3517 3518 self.table.insert(len(self.table.columns), "eTag", row_etags) 3519 3520 return self.table 3521 3522 def _get_columns_of_type(self, types): 3523 """Helper function to get list of columns of a given type(s).""" 3524 matching_columns = [] 3525 for header in self.results.headers: 3526 if header.columnType in types: 3527 matching_columns.append(header.name) 3528 return matching_columns 3529 3530 def _fix_list_columns(self): 3531 """Fix formatting of list-columns.""" 3532 list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"} 3533 list_columns = self._get_columns_of_type(list_types) 3534 for col in list_columns: 3535 self.table[col] = self.table[col].apply(lambda x: ", ".join(x)) 3536 return self.table 3537 3538 def _fix_int_columns(self): 3539 """Ensure that integer-columns are actually integers.""" 3540 int_columns = self._get_columns_of_type({"INTEGER"}) 3541 for col in int_columns: 3542 # Coercing to string because NaN is a floating point value 3543 # and cannot exist alongside integers in a column 3544 def to_int_fn(x): 3545 return "" if np.isnan(x) else str(int(x)) 3546 3547 self.table[col] = self.table[col].apply(to_int_fn) 3548 return self.table
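For reference, a minimal usage sketch of DatasetFileView as a context manager, mirroring what SynapseStorage.getDatasetAnnotationsBatch does above. The login call and the "syn123" dataset ID are placeholders, not values from this module:

import synapseclient

# Placeholder login; any authenticated Synapse client works here.
syn = synapseclient.login()

# The 'with' block guarantees the temporary file view is deleted on exit,
# even if the query raises.
with DatasetFileView("syn123", syn) as fileview:
    annotations = fileview.query()  # tidy pandas DataFrame of annotations

print(annotations.head())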
86@dataclass 87class ManifestDownload(object): 88 """ 89 syn: an object of type synapseclient. 90 manifest_id: id of a manifest 91 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 92 """ 93 94 syn: synapseclient.Synapse 95 manifest_id: str 96 synapse_entity_tracker: SynapseEntityTracker = field( 97 default_factory=SynapseEntityTracker 98 ) 99 100 def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File: 101 """ 102 Try downloading a manifest to a specific folder (temporary or not). When the 103 `use_temporary_folder` is set to True, the manifest will be downloaded to a 104 temporary folder. This is useful for when the code is running as an API server 105 where multiple requests are being made at the same time. This will prevent 106 multiple requests from overwriting the same manifest file. When the 107 `use_temporary_folder` is set to False, the manifest will be downloaded to the 108 default manifest folder. 109 110 Args: 111 use_temporary_folder: boolean argument indicating if a temporary folder 112 should be used to store the manifest file. This is useful when running 113 this code as an API server where multiple requests could be made at the 114 same time. This is set to False when the code is being used from the 115 CLI. Defaults to True. 116 117 Return: 118 manifest_data: A Synapse file entity of the downloaded manifest 119 """ 120 manifest_data = self.synapse_entity_tracker.get( 121 synapse_id=self.manifest_id, 122 syn=self.syn, 123 download_file=False, 124 retrieve_if_not_present=False, 125 ) 126 current_span = trace.get_current_span() 127 if ( 128 manifest_data 129 and (file_handle := manifest_data.get("_file_handle", None)) 130 and current_span.is_recording() 131 ): 132 current_span.set_attribute( 133 "schematic.manifest_size", file_handle.get("contentSize", 0) 134 ) 135 136 if manifest_data and manifest_data.path: 137 return manifest_data 138 139 if "SECRETS_MANAGER_SECRETS" in os.environ: 140 temporary_manifest_storage = "/var/tmp/temp_manifest_download" 141 cleanup_temporary_storage( 142 temporary_manifest_storage, time_delta_seconds=3600 143 ) 144 # create a new directory to store manifest 145 if not os.path.exists(temporary_manifest_storage): 146 os.mkdir(temporary_manifest_storage) 147 # create temporary folders for storing manifests 148 download_location = create_temp_folder( 149 path=temporary_manifest_storage, 150 prefix=f"{self.manifest_id}-{time.time()}-", 151 ) 152 else: 153 if use_temporary_folder: 154 download_location = create_temp_folder( 155 path=CONFIG.manifest_folder, 156 prefix=f"{self.manifest_id}-{time.time()}-", 157 ) 158 else: 159 download_location = CONFIG.manifest_folder 160 161 manifest_data = self.synapse_entity_tracker.get( 162 synapse_id=self.manifest_id, 163 syn=self.syn, 164 download_file=True, 165 retrieve_if_not_present=True, 166 download_location=download_location, 167 ) 168 169 # This is doing a rename of the downloaded file. The reason this is important 170 # is that if we are re-using a file that was previously downloaded, but the 171 # file had been renamed. The file downloaded from the Synapse client is just 172 # a direct copy of that renamed file. This code will set the name of the file 173 # to the original name that was used to download the file. Note: An MD5 checksum 174 # of the file will still be performed so if the file has changed, it will be 175 # downloaded again. 
176 filename = manifest_data._file_handle.fileName 177 if filename != os.path.basename(manifest_data.path): 178 parent_folder = os.path.dirname(manifest_data.path) 179 manifest_original_name_and_path = os.path.join(parent_folder, filename) 180 181 self.syn.cache.remove( 182 file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path 183 ) 184 os.rename(manifest_data.path, manifest_original_name_and_path) 185 manifest_data.path = manifest_original_name_and_path 186 self.syn.cache.add( 187 file_handle_id=manifest_data.dataFileHandleId, 188 path=manifest_original_name_and_path, 189 md5=manifest_data._file_handle.contentMd5, 190 ) 191 192 return manifest_data 193 194 def _entity_type_checking(self) -> None: 195 """ 196 Check the entity type of the ID that needs to be downloaded 197 Return: 198 None; logs an error if the entity type is not a file 199 """ 200 # check the type of entity 201 entity_type = entity_type_mapping( 202 syn=self.syn, 203 entity_id=self.manifest_id, 204 synapse_entity_tracker=self.synapse_entity_tracker, 205 ) 206 if entity_type != "file": 207 logger.error( 208 f"You are using entity type: {entity_type}. Please provide a file ID" 209 ) 210 211 def download_manifest( 212 self, 213 newManifestName: str = "", 214 manifest_df: pd.DataFrame = pd.DataFrame(), 215 use_temporary_folder: bool = True, 216 ) -> Union[str, File]: 217 """ 218 Download a manifest based on a given manifest id. 219 Args: 220 newManifestName(optional): new name of a manifest that gets downloaded. 221 manifest_df(optional): a dataframe containing name and id of manifests in a given asset view 222 Return: 223 manifest_data: synapse entity file object 224 """ 225 226 # enables retrying if user does not have access to uncensored manifest 227 # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location 228 manifest_data = "" 229 230 # check entity type 231 self._entity_type_checking() 232 233 # download a manifest 234 try: 235 manifest_data = self._download_manifest_to_folder( 236 use_temporary_folder=use_temporary_folder 237 ) 238 except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError): 239 # if there's an error getting an uncensored manifest, try getting the censored manifest 240 if not manifest_df.empty: 241 censored_regex = re.compile(".*censored.*") 242 censored = manifest_df["name"].str.contains(censored_regex) 243 new_manifest_id = manifest_df[censored]["id"].iloc[0] 244 self.manifest_id = new_manifest_id 245 try: 246 manifest_data = self._download_manifest_to_folder( 247 use_temporary_folder=use_temporary_folder 248 ) 249 except ( 250 SynapseUnmetAccessRestrictions, 251 SynapseAuthenticationError, 252 ) as e: 253 raise PermissionError( 254 "You don't have access to censored and uncensored manifests in this dataset." 255 ) from e 256 else: 257 logger.error( 258 f"You don't have access to the requested resource: {self.manifest_id}" 259 ) 260 261 if newManifestName and manifest_data and os.path.exists(manifest_data.get("path")): 262 # Rename the file we just made to the new name 263 new_manifest_filename = newManifestName + ".csv" 264 265 # get location of existing manifest. The manifest that will be renamed should live in the same folder as the existing manifest. 266 parent_folder = os.path.dirname(manifest_data.get("path")) 267 268 new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename) 269 270 # Copy file to new location. The purpose of using a copy instead of a rename 271 # is to avoid any potential issues with the file being used in another 272 # process.
This avoids any potential race or concurrency conditions. 273 shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name) 274 275 # Adding this to cache will allow us to re-use the already downloaded 276 # manifest file for up to 1 hour. 277 self.syn.cache.add( 278 file_handle_id=manifest_data.dataFileHandleId, 279 path=new_manifest_path_name, 280 md5=manifest_data._file_handle.contentMd5, 281 ) 282 283 # Update file names/paths in manifest_data 284 manifest_data["name"] = new_manifest_filename 285 manifest_data["filename"] = new_manifest_filename 286 manifest_data["path"] = new_manifest_path_name 287 288 return manifest_data
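A minimal sketch of how ManifestDownload is used directly (SynapseStorage.getDatasetManifest below wraps it). The login call and the "syn456" manifest ID are placeholder assumptions:

import synapseclient

syn = synapseclient.login()  # placeholder; any authenticated client works

md = ManifestDownload(syn=syn, manifest_id="syn456")
# Returns a synapseclient File entity; with use_temporary_folder=True the
# file lands in a per-request temp folder, which API deployments rely on
# to keep concurrent downloads from overwriting each other.
manifest_file = md.download_manifest(use_temporary_folder=True)
print(manifest_file.path)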
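Before the SynapseStorage implementation that follows, a minimal sketch of typical construction and use; the access token and indexing are placeholder assumptions, and the master file view ID is assumed to be configured in the schematic CONFIG:

# Construction logs in and queries the configured file view unless
# perform_query=False is passed.
store = SynapseStorage(access_token="<placeholder token>")

# List accessible storage projects, then the datasets in one of them.
projects = store.getStorageProjects()
project_id, project_name = projects[0]
datasets = store.getStorageDatasetsInProject(projectId=project_id)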
291class SynapseStorage(BaseStorage): 292 """Implementation of Storage interface for datasets/files stored on Synapse. 293 Provides utilities to list files in a specific project, update file annotations, create file views, etc. 294 295 TODO: Need to define the interface and rename and/or refactor some of the methods below. 296 """ 297 298 @tracer.start_as_current_span("SynapseStorage::__init__") 299 def __init__( 300 self, 301 token: Optional[str] = None, # optional parameter retrieved from browser cookie 302 access_token: Optional[str] = None, 303 project_scope: Optional[list] = None, 304 synapse_cache_path: Optional[str] = None, 305 perform_query: Optional[bool] = True, 306 columns: Optional[list] = None, 307 where_clauses: Optional[list] = None, 308 ) -> None: 309 """Initializes a SynapseStorage object. 310 311 Args: 312 token (Optional[str], optional): 313 Optional token parameter as found in browser cookie upon login to synapse. 314 Defaults to None. 315 access_token (Optional[str], optional): 316 Optional access token (personal or oauth). 317 Defaults to None. 318 project_scope (Optional[list], optional): Defaults to None. 319 synapse_cache_path (Optional[str], optional): 320 Location of synapse cache. 321 Defaults to None. 322 TODO: 323 Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands. 324 """ 325 self.syn = self.login(synapse_cache_path, access_token) 326 self.project_scope = project_scope 327 self.storageFileview = CONFIG.synapse_master_fileview_id 328 self.manifest = CONFIG.synapse_manifest_basename 329 self.root_synapse_cache = self.syn.cache.cache_root_dir 330 self.synapse_entity_tracker = SynapseEntityTracker() 331 if perform_query: 332 self.query_fileview(columns=columns, where_clauses=where_clauses) 333 334 # TODO: When moving this over to a regular cron-job the following logic should be 335 # moved out of `manifest_download`: 336 # if "SECRETS_MANAGER_SECRETS" in os.environ: 337 # temporary_manifest_storage = "/var/tmp/temp_manifest_download" 338 # cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600) 339 @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache") 340 def _purge_synapse_cache( 341 self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15 342 ) -> None: 343 """ 344 Purge synapse cache if it exceeds a certain size. Defaults to 1 GB. 345 Args: 346 maximum_storage_allowed_cache_gb (int): the maximum storage allowed 347 before purging cache. Default is 1 GB.
348 minute_buffer (int): All files created this amount of time or older will be deleted 349 """ 350 # try clearing the cache 351 # scan a directory and check size of files 352 if os.path.exists(self.root_synapse_cache): 353 maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * ( 354 1024**3 355 ) 356 nbytes = get_dir_size(self.root_synapse_cache) 357 dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache) 358 # if the maximum allowed cache size has been reached, purge all files older than the minute buffer 359 if dir_size_bytes >= maximum_storage_allowed_cache_bytes: 360 num_of_deleted_files = clear_synapse_cache( 361 self.syn.cache, minutes=minute_buffer 362 ) 363 logger.info( 364 f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}" 365 ) 366 else: 367 # on AWS, the OS takes around 14-17% of our ephemeral storage (20GiB) 368 # instead of guessing how much space we have left, log the total size of .synapseCache here 369 logger.info(f"the total size of .synapseCache is: {nbytes} bytes") 370 371 @tracer.start_as_current_span("SynapseStorage::query_fileview") 372 def query_fileview( 373 self, 374 columns: Optional[list] = None, 375 where_clauses: Optional[list] = None, 376 force_requery: Optional[bool] = False, 377 ) -> None: 378 """ 379 Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. 380 It is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes. 381 Args: 382 columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns. 383 where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None. 384 force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False. 385 """ 386 self._purge_synapse_cache() 387 388 # Initialize to assume that the new fileview query will differ from any stored one; defaults to True because generally no query will have been performed yet 389 self.new_query_different = True 390 391 # If a query has already been performed, store the query 392 previous_query_built = hasattr(self, "fileview_query") 393 if previous_query_built: 394 previous_query = self.fileview_query 395 396 # Build a query with the current given parameters and check to see if it is different from the previous 397 self._build_query(columns=columns, where_clauses=where_clauses) 398 if previous_query_built: 399 self.new_query_different = self.fileview_query != previous_query 400 401 # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved 402 if self.new_query_different or force_requery: 403 try: 404 self.storageFileviewTable = self.syn.tableQuery( 405 query=self.fileview_query, 406 ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) 407 except SynapseHTTPError as exc: 408 exception_text = str(exc) 409 if "Unknown column path" in exception_text: 410 raise ValueError( 411 "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
412 ) 413 elif "Unknown column" in exception_text: 414 missing_column = exception_text.split("Unknown column ")[-1] 415 raise ValueError( 416 f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview." 417 ) 418 else: 419 raise AccessCredentialsError(self.storageFileview) 420 421 @staticmethod 422 def build_clause_from_dataset_id( 423 dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None 424 ) -> str: 425 """ 426 Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized. 427 Args: 428 dataset_id: Synapse ID of a dataset that should be used to limit the query 429 dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query 430 Returns: 431 clause for the query or an empty string if no dataset ID is provided 432 """ 433 # Calling this method without specifying synIDs will complete but will not scope the view 434 if (not dataset_id) and (not dataset_folder_list): 435 return "" 436 437 # This will be used to gather files under a dataset recursively with a fileview query instead of walking 438 if dataset_folder_list: 439 search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list) 440 return f"parentId IN ({search_folders})" 441 442 # `dataset_id` should be provided when all files are stored directly under the dataset folder 443 return f"parentId='{dataset_id}'" 444 445 def _build_query( 446 self, columns: Optional[list] = None, where_clauses: Optional[list] = None 447 ): 448 """ 449 Method to build a query for Synapse FileViews 450 Args: 451 columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns. 452 where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None. 453 self.storageFileview (str): Synapse FileView ID 454 self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None. 455 Gets added to where_clauses; included mainly for backwards compatibility and as a more user-friendly way of subsetting the view. 456 """ 457 if columns is None: 458 columns = [] 459 if where_clauses is None: 460 where_clauses = [] 461 462 if self.project_scope: 463 project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}" 464 where_clauses.append(project_scope_clause) 465 466 if where_clauses: 467 where_clauses = " AND ".join(where_clauses) 468 where_clauses = f"WHERE {where_clauses} ;" 469 else: 470 where_clauses = ";" 471 472 if columns: 473 columns = ",".join(columns) 474 else: 475 columns = "*" 476 477 self.fileview_query = ( 478 f"SELECT {columns} FROM {self.storageFileview} {where_clauses}" 479 ) 480 481 return 482 483 @staticmethod 484 @tracer.start_as_current_span("SynapseStorage::login") 485 def login( 486 synapse_cache_path: Optional[str] = None, 487 access_token: Optional[str] = None, 488 ) -> synapseclient.Synapse: 489 """Login to Synapse 490 491 Args: 492 access_token (Optional[str], optional): A synapse access token. Defaults to None.
493 synapse_cache_path (Optional[str]): location of synapse cache 494 495 Raises: 496 ValueError: If unable to loging with access token 497 498 Returns: 499 synapseclient.Synapse: A Synapse object that is logged in 500 """ 501 if not access_token: 502 access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") 503 504 # login using a token 505 if access_token: 506 try: 507 syn = synapseclient.Synapse( 508 cache_root_dir=synapse_cache_path, 509 debug=False, 510 skip_checks=True, 511 cache_client=False, 512 ) 513 syn.login(authToken=access_token, silent=True) 514 except SynapseHTTPError as exc: 515 raise ValueError( 516 "No access to resources. Please make sure that your token is correct" 517 ) from exc 518 else: 519 # login using synapse credentials provided by user in .synapseConfig (default) file 520 syn = synapseclient.Synapse( 521 configPath=CONFIG.synapse_configuration_path, 522 cache_root_dir=synapse_cache_path, 523 debug=False, 524 skip_checks=True, 525 cache_client=False, 526 ) 527 syn.login(silent=True) 528 529 # set user id attribute 530 current_span = trace.get_current_span() 531 if current_span.is_recording(): 532 current_span.set_attribute("user.id", syn.credentials.owner_id) 533 534 return syn 535 536 def missing_entity_handler(method): 537 def wrapper(*args, **kwargs): 538 try: 539 return method(*args, **kwargs) 540 except SynapseHTTPError as ex: 541 str_message = str(ex).replace("\n", "") 542 if "trash" in str_message or "does not exist" in str_message: 543 logging.warning(str_message) 544 return None 545 else: 546 raise ex 547 548 return wrapper 549 550 def async_missing_entity_handler(method): 551 """Decorator to handle missing entities in async methods.""" 552 553 async def wrapper(*args: Any, **kwargs: Any) -> Any: 554 try: 555 return await method(*args, **kwargs) 556 except SynapseHTTPError as ex: 557 str_message = str(ex).replace("\n", "") 558 if "trash" in str_message or "does not exist" in str_message: 559 logging.warning(str_message) 560 return None 561 else: 562 raise ex 563 564 return wrapper 565 566 def getStorageFileviewTable(self): 567 """Returns the storageFileviewTable obtained during initialization.""" 568 return self.storageFileviewTable 569 570 def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]: 571 """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to. 572 573 Args: 574 currentUserId: synapse id for the user whose projects we want to get. 575 576 Returns: 577 A dictionary with a next page token and the results. 578 """ 579 all_results = self.syn.restGET( 580 "/projects/user/{principalId}".format(principalId=currentUserId) 581 ) 582 583 while ( 584 "nextPageToken" in all_results 585 ): # iterate over next page token in results while there is any 586 results_token = self.syn.restGET( 587 "/projects/user/{principalId}?nextPageToken={nextPageToken}".format( 588 principalId=currentUserId, 589 nextPageToken=all_results["nextPageToken"], 590 ) 591 ) 592 all_results["results"].extend(results_token["results"]) 593 594 if "nextPageToken" in results_token: 595 all_results["nextPageToken"] = results_token["nextPageToken"] 596 else: 597 del all_results["nextPageToken"] 598 599 return all_results 600 601 @tracer.start_as_current_span("SynapseStorage::getStorageProjects") 602 def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]: 603 """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute. 
604 605 Returns: 606 A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName). 607 """ 608 609 # get the set of all storage Synapse project accessible for this pipeline 610 storageProjects = self.storageFileviewTable["projectId"].unique() 611 612 # get the set of storage Synapse project accessible for this user 613 # get a list of projects from Synapse 614 current_user_project_headers = self.synapse_entity_tracker.get_project_headers( 615 current_user_id=self.syn.credentials.owner_id, syn=self.syn 616 ) 617 project_id_to_name_dict = {} 618 current_user_projects = [] 619 for project_header in current_user_project_headers: 620 project_id_to_name_dict[project_header.get("id")] = project_header.get( 621 "name" 622 ) 623 current_user_projects.append(project_header.get("id")) 624 625 # find set of user projects that are also in this pipeline's storage projects set 626 storageProjects = list(set(storageProjects) & set(current_user_projects)) 627 628 # Limit projects to scope if specified 629 if project_scope: 630 storageProjects = list(set(storageProjects) & set(project_scope)) 631 632 if not storageProjects: 633 raise Warning( 634 f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}" 635 ) 636 637 # prepare a return list of project IDs and names 638 projects = [] 639 for projectId in storageProjects: 640 project_name_from_project_header = project_id_to_name_dict.get(projectId) 641 projects.append((projectId, project_name_from_project_header)) 642 643 sorted_projects_list = sorted(projects, key=lambda tup: tup[0]) 644 645 return sorted_projects_list 646 647 @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject") 648 def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: 649 """Gets all datasets in folder under a given storage project that the current user has access to. 650 651 Args: 652 projectId: synapse ID of a storage project. 653 654 Returns: 655 A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). 656 None: If the projectId cannot be found on Synapse. 
657 """ 658 659 # select all folders and fetch their names from within the storage project; 660 # if folder content type is defined, only select folders that contain datasets 661 if "contentType" in self.storageFileviewTable.columns: 662 foldersTable = self.storageFileviewTable[ 663 (self.storageFileviewTable["contentType"] == "dataset") 664 & (self.storageFileviewTable["projectId"] == projectId) 665 ] 666 else: 667 foldersTable = self.storageFileviewTable[ 668 (self.storageFileviewTable["type"] == "folder") 669 & (self.storageFileviewTable["parentId"] == projectId) 670 ] 671 672 # get an array of tuples (folderId, folderName) 673 # some folders are part of datasets; others contain datasets 674 # each dataset parent is the project; folders part of a dataset have another folder as a parent 675 # to get folders if and only if they contain datasets for each folder 676 # check if folder's parent is the project; if so that folder contains a dataset, 677 # unless the folder list has already been filtered to dataset folders based on contentType attribute above 678 679 datasetList = [] 680 folderProperties = ["id", "name"] 681 for folder in list( 682 foldersTable[folderProperties].itertuples(index=False, name=None) 683 ): 684 datasetList.append(folder) 685 686 sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) 687 688 return sorted_dataset_list 689 690 @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") 691 def getFilesInStorageDataset( 692 self, datasetId: str, fileNames: List = None, fullpath: bool = True 693 ) -> List[Tuple[str, str]]: 694 """Gets all files (excluding manifest files) in a given dataset folder. 695 696 Args: 697 datasetId: synapse ID of a storage dataset. 698 fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. 699 metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 700 fullpath: if True return the full path as part of this filename; otherwise return just base filename 701 702 Returns: 703 A list of files; the list consists of tuples (fileId, fileName). 704 705 Raises: 706 ValueError: Dataset ID not found. 707 """ 708 file_list = [] 709 710 # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view 711 if self.storageFileviewTable.empty: 712 raise ValueError( 713 f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." 714 ) 715 716 child_path = self.storageFileviewTable.loc[ 717 self.storageFileviewTable["parentId"] == datasetId, "path" 718 ] 719 if child_path.empty: 720 raise LookupError( 721 f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 
722 ) 723 child_path = child_path.iloc[0] 724 725 # Get the dataset path by eliminating the child's portion of the path to account for nested datasets 726 parent = child_path.split("/")[:-1] 727 parent = "/".join(parent) 728 729 # Format dataset path to be used in table query 730 dataset_path = f"'{parent}/%'" 731 732 # When querying, restrict to type='file' so that folders and subdirectories are excluded 733 where_clauses = [f"path like {dataset_path}", "type='file'"] 734 735 # Requery the fileview to specifically get the files in the given dataset 736 self.query_fileview(columns=["id", "path"], where_clauses=where_clauses) 737 738 # Exclude manifest files 739 non_manifest_files = self.storageFileviewTable.loc[ 740 ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"), 741 :, 742 ] 743 744 # Remove all files that are not in the list of fileNames 745 if fileNames: 746 filename_regex = "|".join(fileNames) 747 748 matching_files = non_manifest_files["path"].str.contains( 749 filename_regex, case=False, regex=True 750 ) 751 752 non_manifest_files = non_manifest_files.loc[matching_files, :] 753 754 # Truncate path if necessary 755 if not fullpath: 756 non_manifest_files.path = non_manifest_files.path.apply(os.path.basename) 757 758 # Return list of files as expected by other methods 759 file_list = list(non_manifest_files.itertuples(index=False, name=None)) 760 761 return file_list 762 763 def _get_manifest_id(self, manifest: pd.DataFrame) -> str: 764 """If both censored and uncensored manifests are present, return the uncensored manifest; if only one manifest is present, return the manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one. 765 Args: 766 manifest: a dataframe containing name and id of manifests in a given asset view 767 768 Return: 769 manifest_syn_id: id of a given censored or uncensored manifest 770 """ 771 censored_regex = re.compile(".*censored.*") 772 censored = manifest["name"].str.contains(censored_regex) 773 if any(censored): 774 # Try to use uncensored manifest first 775 not_censored = ~censored 776 if any(not_censored): 777 manifest_syn_id = manifest[not_censored]["id"].iloc[0] 778 # if only censored manifests are available, just use the first censored manifest 779 else: 780 manifest_syn_id = manifest["id"].iloc[0] 781 782 # otherwise, use the first (implied only) version that exists 783 else: 784 manifest_syn_id = manifest["id"].iloc[0] 785 786 return manifest_syn_id 787 788 @tracer.start_as_current_span("SynapseStorage::getDatasetManifest") 789 def getDatasetManifest( 790 self, 791 datasetId: str, 792 downloadFile: bool = False, 793 newManifestName: str = "", 794 use_temporary_folder: bool = True, 795 ) -> Union[str, File]: 796 """Gets the manifest associated with a given dataset. 797 798 Args: 799 datasetId: synapse ID of a storage dataset. 800 downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not. 801 newManifestName: new name of a manifest that gets downloaded 802 use_temporary_folder: boolean argument indicating if a temporary folder 803 should be used to store the manifest file. This is useful when running 804 this code as an API server where multiple requests could be made at the 805 same time. This is set to False when the code is being used from the 806 CLI. Defaults to True. 807 808 Returns: 809 manifest_syn_id (String): Synapse ID of existing manifest file. 810 manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
811 "" (String): No pre-exisiting manifest in dataset. 812 """ 813 manifest_data = "" 814 815 # get a list of files containing the manifest for this dataset (if any) 816 all_files = self.storageFileviewTable 817 818 # construct regex based on manifest basename in the config 819 manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") 820 821 # search manifest based on given manifest basename regex above 822 # and return a dataframe containing name and id of manifests in a given asset view 823 manifest = all_files[ 824 (all_files["name"].str.contains(manifest_re, regex=True)) 825 & (all_files["parentId"] == datasetId) 826 ] 827 828 manifest = manifest[["id", "name"]] 829 830 # if there is no pre-exisiting manifest in the specified dataset 831 if manifest.empty: 832 logger.warning( 833 f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" 834 ) 835 return "" 836 837 # if there is an exisiting manifest 838 else: 839 manifest_syn_id = self._get_manifest_id(manifest) 840 if downloadFile: 841 md = ManifestDownload( 842 self.syn, 843 manifest_id=manifest_syn_id, 844 synapse_entity_tracker=self.synapse_entity_tracker, 845 ) 846 manifest_data = md.download_manifest( 847 newManifestName=newManifestName, 848 manifest_df=manifest, 849 use_temporary_folder=use_temporary_folder, 850 ) 851 # TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, 852 # then we should catch the error here without returning an empty string. 853 if not manifest_data: 854 logger.debug( 855 f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" 856 ) 857 return manifest_data 858 return manifest_syn_id 859 860 def getDataTypeFromManifest(self, manifestId: str): 861 """Fetch a manifest and return data types of all columns 862 Args: 863 manifestId: synapse ID of a manifest 864 """ 865 # get manifest file path 866 manifest_entity = self.synapse_entity_tracker.get( 867 synapse_id=manifestId, syn=self.syn, download_file=True 868 ) 869 manifest_filepath = manifest_entity.path 870 871 # load manifest dataframe 872 manifest = load_df( 873 manifest_filepath, 874 preserve_raw_input=False, 875 data_model=False, 876 ) 877 878 # convert the dataFrame to use best possible dtypes. 879 manifest_new = manifest.convert_dtypes() 880 881 # get data types of columns 882 result = manifest_new.dtypes.to_frame("dtypes").reset_index() 883 884 # return the result as a dictionary 885 result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() 886 887 return result_dict 888 889 def _get_files_metadata_from_dataset( 890 self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None 891 ) -> Optional[dict]: 892 """retrieve file ids under a particular datasetId 893 894 Args: 895 datasetId (str): a dataset id 896 only_new_files (bool): if only adding new files that are not already exist 897 manifest (pd.DataFrame): metadata manifest dataframe. Default to None. 
898 899 Returns: 900 a dictionary that contains filenames and entity IDs under the given datasetId, or None if there are no files under the given dataset ID 901 """ 902 dataset_files = self.getFilesInStorageDataset(datasetId) 903 if dataset_files: 904 dataset_file_names_id_dict = self._get_file_entityIds( 905 dataset_files, only_new_files=only_new_files, manifest=manifest 906 ) 907 return dataset_file_names_id_dict 908 else: 909 return None 910 911 def add_entity_id_and_filename( 912 self, datasetId: str, manifest: pd.DataFrame 913 ) -> pd.DataFrame: 914 """add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present 915 916 Args: 917 datasetId (str): dataset syn id 918 manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and the Filename column is present but completely empty 919 920 Returns: 921 pd.DataFrame: updated manifest with the Filename and entityId columns filled in 922 """ 923 # get file names and entity ids of a given dataset 924 dataset_files_dict = self._get_files_metadata_from_dataset( 925 datasetId, only_new_files=False 926 ) 927 928 if dataset_files_dict: 929 # turn manifest dataframe back to a dictionary for operation 930 manifest_dict = manifest.to_dict("list") 931 932 # update Filename column 933 # add entityId column to the end 934 manifest_dict.update(dataset_files_dict) 935 936 # if the component column exists in the existing manifest, fill in that column 937 if "Component" in manifest_dict.keys(): 938 manifest_dict["Component"] = manifest_dict["Component"] * max( 939 1, len(manifest_dict["Filename"]) 940 ) 941 942 # turn dictionary back to a dataframe 943 manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index") 944 manifest_df_updated = manifest_df_index.transpose() 945 946 # fill na with empty string 947 manifest_df_updated = manifest_df_updated.fillna("") 948 949 # drop index 950 manifest_df_updated = manifest_df_updated.reset_index(drop=True) 951 952 return manifest_df_updated 953 else: 954 return manifest 955 956 def fill_in_entity_id_filename( 957 self, datasetId: str, manifest: pd.DataFrame 958 ) -> Tuple[List, pd.DataFrame]: 959 """fill in the Filename and entityId columns; both will be created if not already present. 960 961 Args: 962 datasetId (str): dataset syn id 963 manifest (pd.DataFrame): existing manifest dataframe.

        Returns:
            Tuple[List, pd.DataFrame]: a list of synIds that are under the given datasetId folder and the updated manifest dataframe
        """
        # get dataset file names and entity ids as a list of tuples
        dataset_files = self.getFilesInStorageDataset(datasetId)

        # update manifest with additional filenames, if any
        # note that if there is an existing manifest and there are files in the dataset
        # the columns Filename and entityId are assumed to be present in the manifest schema
        # TODO: use idiomatic pandas syntax
        if not dataset_files:
            manifest = manifest.fillna("")
            return dataset_files, manifest

        all_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=False, manifest=manifest
        )
        new_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=True, manifest=manifest
        )

        all_files = pd.DataFrame(all_files)
        new_files = pd.DataFrame(new_files)

        # update manifest so that it contains the new dataset files
        manifest = (
            pd.concat([manifest, new_files], sort=False)
            .reset_index()
            .drop("index", axis=1)
        )

        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
        manifest_reindex = manifest.set_index("entityId")
        all_files_reindex = all_files.set_index("entityId")
        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
            manifest_reindex
        )

        # Check if individual file paths in the manifest and from Synapse match
        file_paths_match = (
            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
        )

        # If any paths do not match, overwrite those manifest entries with the file paths from Synapse
        if not file_paths_match.all():
            manifest_reindex.loc[
                ~file_paths_match, "Filename"
            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

            # reformat manifest for further use
            manifest = manifest_reindex.reset_index()
            entityIdCol = manifest.pop("entityId")
            manifest.insert(len(manifest.columns), "entityId", entityIdCol)

        manifest = manifest.fillna("")
        return dataset_files, manifest

    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
    def updateDatasetManifestFiles(
        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
    ) -> Union[Tuple[str, pd.DataFrame], None]:
        """Fetch the names and entity IDs of all current files in the dataset in store, if any; update the dataset's manifest with new files, if any.

        Args:
            dmge: DataModelGraphExplorer instance
            datasetId: synapse ID of a storage dataset.
            store: if True, store the updated manifest in the asset store; if False,
                return a pandas dataframe containing the updated manifest but do not store it


        Returns:
            Synapse ID of the updated manifest and a pandas dataframe containing the updated manifest.
1037 If there is no existing manifest or if the manifest does not have an entityId column, return None 1038 """ 1039 1040 # get existing manifest Synapse ID 1041 manifest_id = self.getDatasetManifest(datasetId) 1042 1043 # if there is no manifest return None 1044 if not manifest_id: 1045 return None 1046 1047 manifest_entity = self.synapse_entity_tracker.get( 1048 synapse_id=manifest_id, syn=self.syn, download_file=True 1049 ) 1050 manifest_filepath = manifest_entity.path 1051 manifest = load_df(manifest_filepath) 1052 1053 # If the manifest does not have an entityId column, trigger a new manifest to be generated 1054 if "entityId" not in manifest.columns: 1055 return None 1056 1057 manifest_is_file_based = "Filename" in manifest.columns 1058 1059 if manifest_is_file_based: 1060 # update manifest with additional filenames, if any 1061 # note that if there is an existing manifest and there are files in the dataset 1062 # the columns Filename and entityId are assumed to be present in manifest schema 1063 # TODO: use idiomatic panda syntax 1064 dataset_files, manifest = self.fill_in_entity_id_filename( 1065 datasetId, manifest 1066 ) 1067 if dataset_files: 1068 # update the manifest file, so that it contains the relevant entity IDs 1069 if store: 1070 manifest.to_csv(manifest_filepath, index=False) 1071 1072 # store manifest and update associated metadata with manifest on Synapse 1073 manifest_id = self.associateMetadataWithFiles( 1074 dmge, manifest_filepath, datasetId 1075 ) 1076 1077 return manifest_id, manifest 1078 1079 def _get_file_entityIds( 1080 self, 1081 dataset_files: List, 1082 only_new_files: bool = False, 1083 manifest: pd.DataFrame = None, 1084 ): 1085 """ 1086 Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files 1087 1088 Args: 1089 manifest: metadata manifest 1090 dataset_file: List of all files in a dataset 1091 only_new_files: boolean to control whether only new files are returned or all files in the dataset 1092 Returns: 1093 files: dictionary of file names and entityIDs, with scope as specified by `only_new_files` 1094 """ 1095 files = {"Filename": [], "entityId": []} 1096 1097 if only_new_files: 1098 if manifest is None: 1099 raise UnboundLocalError( 1100 "No manifest was passed in, a manifest is required when `only_new_files` is True." 1101 ) 1102 1103 if "entityId" not in manifest.columns: 1104 raise ValueError( 1105 "The manifest in your dataset and/or top level folder must contain the 'entityId' column. " 1106 "Please generate an empty manifest without annotations, manually add annotations to the " 1107 "appropriate files in the manifest, and then try again." 1108 ) 1109 1110 # find new files (that are not in the current manifest) if any 1111 for file_id, file_name in dataset_files: 1112 if not file_id in manifest["entityId"].values: 1113 files["Filename"].append(file_name) 1114 files["entityId"].append(file_id) 1115 else: 1116 # get all files 1117 for file_id, file_name in dataset_files: 1118 files["Filename"].append(file_name) 1119 files["entityId"].append(file_id) 1120 1121 return files 1122 1123 @tracer.start_as_current_span("SynapseStorage::getProjectManifests") 1124 def getProjectManifests( 1125 self, projectId: str 1126 ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: 1127 """Gets all metadata manifest files across all datasets in a specified project. 

        Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
            as a list of tuples, one for each manifest:
                [
                    (
                        (datasetId, dataName),
                        (manifestId, manifestName),
                        (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
                    ),
                    ...
                ]

        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
        """
        component = None
        entity = None
        manifests = []

        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            # Get synID of manifest for a dataset
            manifestId = self.getDatasetManifest(datasetId)

            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
            if manifestId:
                annotations = self.getFileAnnotations(manifestId)

                # If the manifest has annotations specifying the component, use that
                if annotations and "Component" in annotations:
                    component = annotations["Component"]
                    entity = self.synapse_entity_tracker.get(
                        synapse_id=manifestId, syn=self.syn, download_file=False
                    )
                    manifest_name = entity["properties"]["name"]

                # otherwise download the manifest and parse it for the information
                elif not annotations or "Component" not in annotations:
                    logger.debug(
                        f"No component annotations have been found for manifest {manifestId}. "
                        "The manifest will be downloaded and parsed instead. "
                        "For increased speed, add component annotations to manifest."
                    )

                    manifest_info = self.getDatasetManifest(
                        datasetId, downloadFile=True
                    )
                    manifest_name = manifest_info["properties"].get("name", "")

                    if not manifest_name:
                        logger.error(f"Failed to download manifests from {datasetId}")

                    manifest_path = manifest_info["path"]

                    manifest_df = load_df(manifest_path)

                    # Get component from the Component column if it exists
                    if (
                        "Component" in manifest_df
                        and not manifest_df["Component"].empty
                    ):
                        component = list(set(manifest_df["Component"]))

                        # Added to address issues raised during DCA testing
                        if "" in component:
                            component.remove("")

                        if len(component) == 1:
                            component = component[0]
                        elif len(component) > 1:
                            logger.warning(
                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time."
1204 "Behavior of manifests with multiple components is undefined" 1205 ) 1206 else: 1207 manifest_name = "" 1208 component = None 1209 if component: 1210 manifest = ( 1211 (datasetId, datasetName), 1212 (manifestId, manifest_name), 1213 (component, component), 1214 ) 1215 elif manifestId: 1216 logging.debug( 1217 f"Manifest {manifestId} does not have an associated Component" 1218 ) 1219 manifest = ( 1220 (datasetId, datasetName), 1221 (manifestId, manifest_name), 1222 ("", ""), 1223 ) 1224 else: 1225 manifest = ( 1226 (datasetId, datasetName), 1227 ("", ""), 1228 ("", ""), 1229 ) 1230 1231 if manifest: 1232 manifests.append(manifest) 1233 1234 return manifests 1235 1236 def upload_project_manifests_to_synapse( 1237 self, dmge: DataModelGraphExplorer, projectId: str 1238 ) -> List[str]: 1239 """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. 1240 1241 Returns: String of all the manifest_table_ids of all the manifests that have been loaded. 1242 """ 1243 1244 manifests = [] 1245 manifest_loaded = [] 1246 datasets = self.getStorageDatasetsInProject(projectId) 1247 1248 for datasetId, datasetName in datasets: 1249 # encode information about the manifest in a simple list (so that R clients can unpack it) 1250 # eventually can serialize differently 1251 1252 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1253 1254 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1255 if manifest_info: 1256 manifest_id = manifest_info["properties"]["id"] 1257 manifest_name = manifest_info["properties"]["name"] 1258 manifest_path = manifest_info["path"] 1259 manifest_df = load_df(manifest_path) 1260 manifest_table_id = uploadDB( 1261 dmge=dmge, 1262 manifest=manifest, 1263 datasetId=datasetId, 1264 table_name=datasetName, 1265 ) 1266 manifest_loaded.append(datasetName) 1267 return manifest_loaded 1268 1269 def upload_annotated_project_manifests_to_synapse( 1270 self, projectId: str, path_to_json_ld: str, dry_run: bool = False 1271 ) -> List[str]: 1272 """ 1273 Purpose: 1274 For all manifests in a project, upload them as a table and add annotations manifest csv. 1275 Assumes the manifest is already present as a CSV in a dataset in the project. 
        """
        # DataModelParser and DataModelGraph are not part of this module's
        # top-level imports, so import them here before use
        from schematic.schemas.data_model_graph import DataModelGraph
        from schematic.schemas.data_model_parser import DataModelParser

        # Instantiate DataModelParser
        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
        # Parse Model
        parsed_data_model = data_model_parser.parse_model()

        # Instantiate DataModelGraph
        data_model_grapher = DataModelGraph(parsed_data_model)

        # Generate graph
        graph_data_model = data_model_grapher.generate_data_model_graph()

        # Instantiate DataModelGraphExplorer
        dmge = DataModelGraphExplorer(graph_data_model)

        manifests = []
        manifest_loaded = []
        datasets = self.getStorageDatasetsInProject(projectId)
        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
            manifests.append(manifest)

            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)

            if manifest_info:
                manifest_id = manifest_info["properties"]["id"]
                manifest_name = manifest_info["properties"]["name"]
                manifest_path = manifest_info["path"]
                manifest = (
                    (datasetId, datasetName),
                    (manifest_id, manifest_name),
                    ("", ""),
                )
                if not dry_run:
                    self.associateMetadataWithFiles(
                        dmge, manifest_path, datasetId, manifest_record_type="table"
                    )
                manifest_loaded.append(manifest)

        return manifests, manifest_loaded

    def move_entities_to_new_project(
        self,
        projectId: str,
        newProjectId: str,
        returnEntities: bool = False,
        dry_run: bool = False,
    ):
        """
        For each manifest csv in a project, look up all the entity ids that are associated with it.
        Find each entity in the file view and move it to the new project.
        """

        manifests = []
        manifest_loaded = []
        datasets = self.getStorageDatasetsInProject(projectId)
        if datasets:
            for datasetId, datasetName in datasets:
                # encode information about the manifest in a simple list (so that R clients can unpack it)
                # eventually can serialize differently

                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
                manifests.append(manifest)

                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
                if manifest_info:
                    manifest_id = manifest_info["properties"]["id"]
                    manifest_name = manifest_info["properties"]["name"]
                    manifest_path = manifest_info["path"]
                    manifest_df = load_df(manifest_path)

                    manifest = (
                        (datasetId, datasetName),
                        (manifest_id, manifest_name),
                        ("", ""),
                    )
                    manifest_loaded.append(manifest)

                    annotation_entities = self.storageFileviewTable[
                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
                        & (self.storageFileviewTable["type"] == "folder")
                    ]["id"]

                    if returnEntities:
                        for entityId in annotation_entities:
                            if not dry_run:
                                moved_entity = self.syn.move(entityId, datasetId)
                                self.synapse_entity_tracker.add(
                                    synapse_id=moved_entity.id, entity=moved_entity
                                )
                            else:
                                logger.info(
                                    f"{entityId} will be moved to folder {datasetId}."
1373 ) 1374 else: 1375 # generate project folder 1376 archive_project_folder = Folder( 1377 projectId + "_archive", parent=newProjectId 1378 ) 1379 archive_project_folder = self.syn.store(archive_project_folder) 1380 self.synapse_entity_tracker.add( 1381 synapse_id=archive_project_folder.id, 1382 entity=archive_project_folder, 1383 ) 1384 1385 # generate dataset folder 1386 dataset_archive_folder = Folder( 1387 "_".join([datasetId, datasetName, "archive"]), 1388 parent=archive_project_folder.id, 1389 ) 1390 dataset_archive_folder = self.syn.store(dataset_archive_folder) 1391 self.synapse_entity_tracker.add( 1392 synapse_id=dataset_archive_folder.id, 1393 entity=dataset_archive_folder, 1394 ) 1395 1396 for entityId in annotation_entities: 1397 # move entities to folder 1398 if not dry_run: 1399 moved_entity = self.syn.move( 1400 entityId, dataset_archive_folder.id 1401 ) 1402 self.synapse_entity_tracker.add( 1403 synapse_id=moved_entity.id, entity=moved_entity 1404 ) 1405 else: 1406 logging.info( 1407 f"{entityId} will be moved to folder {dataset_archive_folder.id}." 1408 ) 1409 else: 1410 raise LookupError( 1411 f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." 1412 ) 1413 return manifests, manifest_loaded 1414 1415 @tracer.start_as_current_span("SynapseStorage::get_synapse_table") 1416 def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: 1417 """Download synapse table as a pd dataframe; return table schema and etags as results too 1418 1419 Args: 1420 synapse_id: synapse ID of the table to query 1421 """ 1422 1423 results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) 1424 df = results.asDataFrame( 1425 rowIdAndVersionInIndex=False, 1426 na_values=STR_NA_VALUES_FILTERED, 1427 keep_default_na=False, 1428 ) 1429 1430 return df, results 1431 1432 @missing_entity_handler 1433 @tracer.start_as_current_span("SynapseStorage::uploadDB") 1434 def uploadDB( 1435 self, 1436 dmge: DataModelGraphExplorer, 1437 manifest: pd.DataFrame, 1438 datasetId: str, 1439 table_name: str, 1440 restrict: bool = False, 1441 table_manipulation: str = "replace", 1442 table_column_names: str = "class_label", 1443 ): 1444 """ 1445 Method to upload a database to an asset store. In synapse, this will upload a metadata table 1446 1447 Args: 1448 dmge: DataModelGraphExplorer object 1449 manifest: pd.Df manifest to upload 1450 datasetId: synID of the dataset for the manifest 1451 table_name: name of the table to be uploaded 1452 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1453 existingTableId: str of the synId of the existing table, if one already exists 1454 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1455 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 1456 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 1457 display label formatting. 
        Returns:
            manifest_table_id: synID of the uploaded table
            manifest: the original manifest
            table_manifest: manifest formatted appropriately for the table

        """

        col_schema, table_manifest = self.formatDB(
            dmge=dmge, manifest=manifest, table_column_names=table_column_names
        )

        manifest_table_id = self.buildDB(
            datasetId,
            table_name,
            col_schema,
            table_manifest,
            table_manipulation,
            dmge,
            restrict,
        )

        return manifest_table_id, manifest, table_manifest

    @tracer.start_as_current_span("SynapseStorage::formatDB")
    def formatDB(self, dmge, manifest, table_column_names):
        """
        Method to format a manifest appropriately for upload as a table

        Args:
            dmge: DataModelGraphExplorer object
            manifest: pd.Df manifest to upload
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
        Returns:
            col_schema: schema for table columns: type, size, etc
            table_manifest: formatted manifest

        """
        # Rename the manifest columns to display names to match the fileview

        blacklist_chars = ["(", ")", ".", " ", "-"]
        manifest_columns = manifest.columns.tolist()

        table_manifest = deepcopy(manifest)

        if table_column_names == "display_name":
            cols = table_manifest.columns

        elif table_column_names == "display_label":
            cols = [
                str(col).translate({ord(x): "" for x in blacklist_chars})
                for col in manifest_columns
            ]

        elif table_column_names == "class_label":
            cols = [
                get_class_label_from_display_name(str(col)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )
                for col in manifest_columns
            ]
        else:
            raise ValueError(
                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
            )

        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))

        # Reset column names in the table manifest
        table_manifest.columns = cols

        # move entityId to the end of the df
        entity_col = table_manifest.pop("entityId")
        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)

        # Get the column schema
        col_schema = as_table_columns(table_manifest)

        # Set the Id column length to 64 (for some reason it is not being auto-set)
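        # `as_table_columns` infers string column sizes from the sample data, which
        # can undersize the Id column; 64 characters comfortably fits a 36-character
        # UUID with headroom.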
1538 for i, col in enumerate(col_schema): 1539 if col["name"].lower() == "id": 1540 col_schema[i]["maximumSize"] = 64 1541 1542 return col_schema, table_manifest 1543 1544 @tracer.start_as_current_span("SynapseStorage::buildDB") 1545 def buildDB( 1546 self, 1547 datasetId: str, 1548 table_name: str, 1549 col_schema: List, 1550 table_manifest: pd.DataFrame, 1551 table_manipulation: str, 1552 dmge: DataModelGraphExplorer, 1553 restrict: bool = False, 1554 ): 1555 """ 1556 Method to construct the table appropriately: create new table, replace existing, or upsert new into existing 1557 Calls TableOperations class to execute 1558 1559 Args: 1560 datasetId: synID of the dataset for the manifest 1561 table_name: name of the table to be uploaded 1562 col_schema: schema for table columns: type, size, etc from `formatDB` 1563 table_manifest: formatted manifest that can be uploaded as a table 1564 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1565 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1566 1567 Returns: 1568 manifest_table_id: synID of the uploaded table 1569 1570 """ 1571 table_parent_id = self.getDatasetProject(datasetId=datasetId) 1572 existing_table_id = self.syn.findEntityId( 1573 name=table_name, parent=table_parent_id 1574 ) 1575 1576 tableOps = TableOperations( 1577 synStore=self, 1578 tableToLoad=table_manifest, 1579 tableName=table_name, 1580 datasetId=datasetId, 1581 existingTableId=existing_table_id, 1582 restrict=restrict, 1583 synapse_entity_tracker=self.synapse_entity_tracker, 1584 ) 1585 1586 if not table_manipulation or existing_table_id is None: 1587 manifest_table_id = tableOps.createTable( 1588 columnTypeDict=col_schema, 1589 specifySchema=True, 1590 ) 1591 elif existing_table_id is not None: 1592 if table_manipulation.lower() == "replace": 1593 manifest_table_id = tableOps.replaceTable( 1594 specifySchema=True, 1595 columnTypeDict=col_schema, 1596 ) 1597 elif table_manipulation.lower() == "upsert": 1598 manifest_table_id = tableOps.upsertTable( 1599 dmge=dmge, 1600 ) 1601 elif table_manipulation.lower() == "update": 1602 manifest_table_id = tableOps.updateTable() 1603 1604 if table_manipulation and table_manipulation.lower() == "upsert": 1605 table_entity = self.synapse_entity_tracker.get( 1606 synapse_id=existing_table_id or manifest_table_id, 1607 syn=self.syn, 1608 download_file=False, 1609 ) 1610 annos = OldAnnotations( 1611 id=table_entity.id, 1612 etag=table_entity.etag, 1613 values=table_entity.annotations, 1614 ) 1615 annos["primary_key"] = table_manifest["Component"][0] + "_id" 1616 annos = self.syn.set_annotations(annos) 1617 table_entity.etag = annos.etag 1618 table_entity.annotations = annos 1619 1620 return manifest_table_id 1621 1622 @tracer.start_as_current_span("SynapseStorage::upload_manifest_file") 1623 def upload_manifest_file( 1624 self, 1625 manifest, 1626 metadataManifestPath, 1627 datasetId, 1628 restrict_manifest, 1629 component_name="", 1630 ): 1631 # Update manifest to have the new entityId column 1632 manifest.to_csv(metadataManifestPath, index=False) 1633 1634 # store manifest to Synapse as a CSV 1635 # update file name 1636 file_name_full = metadataManifestPath.split("/")[-1] 1637 file_extension = file_name_full.split(".")[-1] 1638 1639 # Differentiate "censored" and "uncensored" manifest 1640 if "censored" in file_name_full: 1641 file_name_new = ( 
1642 os.path.basename(CONFIG.synapse_manifest_basename) 1643 + "_" 1644 + component_name 1645 + "_censored" 1646 + "." 1647 + file_extension 1648 ) 1649 else: 1650 file_name_new = ( 1651 os.path.basename(CONFIG.synapse_manifest_basename) 1652 + "_" 1653 + component_name 1654 + "." 1655 + file_extension 1656 ) 1657 1658 manifest_synapse_file = None 1659 try: 1660 # Rename the file to file_name_new then revert 1661 # This is to maintain the original file name in-case other code is 1662 # expecting that the file exists with the original name 1663 original_file_path = metadataManifestPath 1664 new_file_path = os.path.join( 1665 os.path.dirname(metadataManifestPath), file_name_new 1666 ) 1667 os.rename(original_file_path, new_file_path) 1668 1669 manifest_synapse_file = self._store_file_for_manifest_upload( 1670 new_file_path=new_file_path, 1671 dataset_id=datasetId, 1672 existing_file_name=file_name_full, 1673 file_name_new=file_name_new, 1674 restrict_manifest=restrict_manifest, 1675 ) 1676 manifest_synapse_file_id = manifest_synapse_file.id 1677 1678 finally: 1679 # Revert the file name back to the original 1680 os.rename(new_file_path, original_file_path) 1681 1682 if manifest_synapse_file: 1683 manifest_synapse_file.path = original_file_path 1684 1685 return manifest_synapse_file_id 1686 1687 def _store_file_for_manifest_upload( 1688 self, 1689 new_file_path: str, 1690 dataset_id: str, 1691 existing_file_name: str, 1692 file_name_new: str, 1693 restrict_manifest: bool, 1694 ) -> File: 1695 """Handles a create or update of a manifest file that is going to be uploaded. 1696 If we already have a copy of the Entity in memory we will update that instance, 1697 otherwise create a new File instance to be created in Synapse. Once stored 1698 this will add the file to the `synapse_entity_tracker` for future reference. 
1699 1700 Args: 1701 new_file_path (str): The path to the new manifest file 1702 dataset_id (str): The Synapse ID of the dataset the manifest is associated with 1703 existing_file_name (str): The name of the existing file 1704 file_name_new (str): The name of the new file 1705 restrict_manifest (bool): Whether the manifest should be restricted 1706 1707 Returns: 1708 File: The stored manifest file 1709 """ 1710 local_tracked_file_instance = ( 1711 self.synapse_entity_tracker.search_local_by_parent_and_name( 1712 name=existing_file_name, parent_id=dataset_id 1713 ) 1714 or self.synapse_entity_tracker.search_local_by_parent_and_name( 1715 name=file_name_new, parent_id=dataset_id 1716 ) 1717 ) 1718 1719 if local_tracked_file_instance: 1720 local_tracked_file_instance.path = new_file_path 1721 local_tracked_file_instance.description = ( 1722 "Manifest for dataset " + dataset_id 1723 ) 1724 manifest_synapse_file = local_tracked_file_instance 1725 else: 1726 manifest_synapse_file = File( 1727 path=new_file_path, 1728 description="Manifest for dataset " + dataset_id, 1729 parent=dataset_id, 1730 name=file_name_new, 1731 ) 1732 1733 manifest_synapse_file = self.syn.store( 1734 manifest_synapse_file, isRestricted=restrict_manifest 1735 ) 1736 1737 self.synapse_entity_tracker.add( 1738 synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file 1739 ) 1740 return manifest_synapse_file 1741 1742 async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]: 1743 """get annotations asynchronously 1744 1745 Args: 1746 synapse_id (str): synapse id of the entity that the annotation belongs 1747 1748 Returns: 1749 Dict[str, Any]: The requested entity bundle matching 1750 <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html> 1751 """ 1752 return await get_entity_id_bundle2( 1753 entity_id=synapse_id, 1754 request={"includeAnnotations": True}, 1755 synapse_client=self.syn, 1756 ) 1757 1758 async def store_async_annotation(self, annotation_dict: dict) -> Annotations: 1759 """store annotation in an async way 1760 1761 Args: 1762 annotation_dict (dict): annotation in a dictionary format 1763 1764 Returns: 1765 Annotations: The stored annotations. 1766 """ 1767 annotation_data = Annotations.from_dict( 1768 synapse_annotations=annotation_dict["annotations"]["annotations"] 1769 ) 1770 annotation_class = Annotations( 1771 annotations=annotation_data, 1772 etag=annotation_dict["annotations"]["etag"], 1773 id=annotation_dict["annotations"]["id"], 1774 ) 1775 annotation_storage_result = await annotation_class.store_async( 1776 synapse_client=self.syn 1777 ) 1778 local_entity = self.synapse_entity_tracker.get( 1779 synapse_id=annotation_dict["annotations"]["id"], 1780 syn=self.syn, 1781 download_file=False, 1782 retrieve_if_not_present=False, 1783 ) 1784 if local_entity: 1785 local_entity.etag = annotation_storage_result.etag 1786 local_entity.annotations = annotation_storage_result 1787 return annotation_storage_result 1788 1789 def process_row_annotations( 1790 self, 1791 dmge: DataModelGraphExplorer, 1792 metadata_syn: Dict[str, Any], 1793 hide_blanks: bool, 1794 csv_list_regex: str, 1795 annos: Dict[str, Any], 1796 annotation_keys: str, 1797 ) -> Dict[str, Any]: 1798 """Processes metadata annotations based on the logic below: 1799 1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is: 1800 An empty or whitespace-only string. 1801 A NaN value (if the annotation is a float). 
            If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded, and further processing of that key is skipped.
            If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.

        2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name".
            Check if the rule contains "list" as a rule; if it does, split the string by commas and assign the resulting list as the annotation value for that key.

        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

        4. Returns the updated annotations dictionary.

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer
            metadata_syn (dict): metadata used for Synapse storage
            hide_blanks (bool): if true, does not upload annotation keys with blank values.
            csv_list_regex (str): regex to match a comma separated list
            annos (Dict[str, Any]): dictionary of annotations returned from Synapse
            annotation_keys (str): display_label/class_label

        Returns:
            Dict[str, Any]: annotations as a dictionary

        ```mermaid
        flowchart TD
            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
            C -- Yes --> D{Is hide_blanks True?}
            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
            D -- No --> F[Assign empty string to annotation key]
            C -- No --> G{Is anno_v a string?}
            G -- No --> H[Assign original value of anno_v to annotation key]
            G -- Yes --> I{Does anno_v match csv_list_regex?}
            I -- Yes --> J[Get validation rule of anno_k]
            J --> K{Does the validation rule contain 'list'}
            K -- Yes --> L[Split anno_v by commas and assign as list]
            I -- No --> H
            K -- No --> H
        ```
        """
        for anno_k, anno_v in metadata_syn.items():
            # Remove keys whose values are NaN, empty strings, or whitespace-only
            # strings from the dict of annotations to be uploaded
            if hide_blanks and (
                (isinstance(anno_v, str) and anno_v.strip() == "")
                or (isinstance(anno_v, float) and np.isnan(anno_v))
            ):
                if anno_k in annos["annotations"]["annotations"]:
                    annos["annotations"]["annotations"].pop(anno_k)
                continue

            # Otherwise save the annotation as appropriate
            if isinstance(anno_v, float) and np.isnan(anno_v):
                annos["annotations"]["annotations"][anno_k] = ""
                continue

            # Handle strings that match the csv_list_regex and pass the validation rule
            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
                # Use a dictionary to dynamically choose the argument
                param = (
                    {"node_display_name": anno_k}
                    if annotation_keys == "display_label"
                    else {"node_label": anno_k}
                )
                node_validation_rules = dmge.get_node_validation_rules(**param)

                if rule_in_rule_list("list", node_validation_rules):
                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
                    continue
            # default: assign the original value
            annos["annotations"]["annotations"][anno_k] = anno_v

        return annos

    @async_missing_entity_handler
    async def format_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        row: pd.Series,
        entityId: str,
        hideBlanks: bool,
        annotation_keys: str,
    ) -> Union[None, Dict[str, Any]]:
        """Format row annotations

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer object
            row (pd.Series): row of the manifest
            entityId (str): entity id of the manifest
            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, uploads annotation keys with empty string values
            annotation_keys (str): display_label/class_label

        Returns:
            Union[None, Dict[str, Any]]: if the entity id is in the trash can, return None. Otherwise, return the annotations
        """
        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g. no spaces, parentheses))
        # note: the removal of special characters will apply only to annotation keys; we are not altering the manifest
        # this could create a divergence between manifest columns and annotations. this should be ok for most use cases.
        # columns with special characters are outside of the schema
        metadataSyn = {}
        blacklist_chars = ["(", ")", ".", " ", "-"]

        for k, v in row.to_dict().items():
            if annotation_keys == "display_label":
                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
            elif annotation_keys == "class_label":
                keySyn = get_class_label_from_display_name(str(k)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )

            # Skip `Filename` and `ETag` columns when setting annotations
            if keySyn in ["Filename", "ETag", "eTag"]:
                continue

            # truncate annotation values to 500 characters if the
            # size of the value is greater than or equal to 500 characters
            # add an explicit [truncatedByDataCuratorApp] message at the end
            # of every truncated message to indicate that the cell value
            # has been truncated
            if isinstance(v, str) and len(v) >= 500:
                v = v[0:472] + "[truncatedByDataCuratorApp]"

            metadataSyn[keySyn] = v

        # This will first check if the entity is already in memory, and if so, that
        # instance is used. Unfortunately, the expected return format needs to match
        # the Synapse API, so we need to convert the annotations to the expected format.
        entity = self.synapse_entity_tracker.get(
            synapse_id=entityId,
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if entity is not None:
            synapse_annotations = _convert_to_annotations_list(
                annotations=entity.annotations
            )
            annos = {
                "annotations": {
                    "id": entity.id,
                    "etag": entity.etag,
                    "annotations": synapse_annotations,
                }
            }
        else:
            annos = await self.get_async_annotation(entityId)

        # set annotation(s) for the various objects/items in a dataset on Synapse
        csv_list_regex = comma_separated_list_regex()

        annos = self.process_row_annotations(
            dmge=dmge,
            metadata_syn=metadataSyn,
            hide_blanks=hideBlanks,
            csv_list_regex=csv_list_regex,
            annos=annos,
            annotation_keys=annotation_keys,
        )

        return annos

    @missing_entity_handler
    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
    def format_manifest_annotations(self, manifest, manifest_synapse_id):
        """
        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
        For now just getting the Component.
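
        A minimal usage sketch (the Synapse ID is illustrative only):

        ```python
        annos = self.format_manifest_annotations(
            manifest=manifest_df, manifest_synapse_id="syn12345678"
        )
        self.syn.set_annotations(annos)
        ```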
1967 """ 1968 1969 entity = self.synapse_entity_tracker.get( 1970 synapse_id=manifest_synapse_id, syn=self.syn, download_file=False 1971 ) 1972 is_file = entity.concreteType.endswith(".FileEntity") 1973 is_table = entity.concreteType.endswith(".TableEntity") 1974 1975 if is_file: 1976 # Get file metadata 1977 metadata = self.getFileAnnotations(manifest_synapse_id) 1978 1979 # If there is a defined component add it to the metadata. 1980 if "Component" in manifest.columns: 1981 # Gather component information 1982 component = manifest["Component"].unique() 1983 1984 # Double check that only a single component is listed, else raise an error. 1985 try: 1986 len(component) == 1 1987 except ValueError as err: 1988 raise ValueError( 1989 f"Manifest has more than one component. Please check manifest and resubmit." 1990 ) from err 1991 1992 # Add component to metadata 1993 metadata["Component"] = component[0] 1994 1995 elif is_table: 1996 # Get table metadata 1997 metadata = self.getTableAnnotations(manifest_synapse_id) 1998 1999 # Get annotations 2000 annos = OldAnnotations( 2001 id=entity.id, etag=entity.etag, values=entity.annotations 2002 ) 2003 2004 # Add metadata to the annotations 2005 for annos_k, annos_v in metadata.items(): 2006 annos[annos_k] = annos_v 2007 2008 return annos 2009 2010 ''' 2011 def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath, 2012 useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False): 2013 """ 2014 Purpose: 2015 Works very similarly to associateMetadataWithFiles except takes in the manifest 2016 rather than the manifest path 2017 2018 """ 2019 2020 # Add uuid for table updates and fill. 2021 if not "Uuid" in manifest.columns: 2022 manifest["Uuid"] = '' 2023 2024 for idx,row in manifest.iterrows(): 2025 if not row["Uuid"]: 2026 gen_uuid = uuid.uuid4() 2027 row["Uuid"] = gen_uuid 2028 manifest.loc[idx, 'Uuid'] = gen_uuid 2029 2030 # add entityId as a column if not already there or 2031 # fill any blanks with an empty string. 2032 if not "entityId" in manifest.columns: 2033 manifest["entityId"] = "" 2034 else: 2035 manifest["entityId"].fillna("", inplace=True) 2036 2037 # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations 2038 dmge = DataModelGraphExplorer() 2039 2040 # Create table name here. 
2041 if 'Component' in manifest.columns: 2042 table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table' 2043 else: 2044 table_name = 'synapse_storage_manifest_table' 2045 2046 # Upload manifest as a table and get the SynID and manifest 2047 manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table( 2048 dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,) 2049 2050 # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed 2051 # also set metadata for each synapse entity as Synapse annotations 2052 for idx, row in manifest.iterrows(): 2053 if not row["entityId"]: 2054 # If not using entityIds, fill with manifest_table_id so 2055 row["entityId"] = manifest_synapse_table_id 2056 entityId = '' 2057 else: 2058 # get the entity id corresponding to this row 2059 entityId = row["entityId"] 2060 2061 # Load manifest to synapse as a CSV File 2062 manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest) 2063 2064 # Get annotations for the file manifest. 2065 manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id) 2066 2067 self.syn.set_annotations(manifest_annotations) 2068 2069 logger.info("Associated manifest file with dataset on Synapse.") 2070 2071 # Update manifest Synapse table with new entity id column. 2072 self.make_synapse_table( 2073 table_to_load = table_manifest, 2074 dataset_id = datasetId, 2075 existingTableId = manifest_synapse_table_id, 2076 table_name = table_name, 2077 update_col = 'Uuid', 2078 specify_schema = False, 2079 ) 2080 2081 # Get annotations for the table manifest 2082 manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id) 2083 self.syn.set_annotations(manifest_annotations) 2084 return manifest_synapse_table_id 2085 ''' 2086 2087 def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame: 2088 """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing. 2089 Args: 2090 metadataManifestPath (str): path where manifest is stored 2091 Returns: 2092 manifest(pd.DataFrame): Manifest loaded as a pandas dataframe 2093 Raises: 2094 FileNotFoundError: Manifest file does not exist at provided path. 2095 """ 2096 # read new manifest csv 2097 try: 2098 load_args = { 2099 "dtype": "string", 2100 } 2101 manifest = load_df( 2102 metadataManifestPath, 2103 preserve_raw_input=False, 2104 allow_na_values=False, 2105 **load_args, 2106 ) 2107 except FileNotFoundError as err: 2108 raise FileNotFoundError( 2109 f"No manifest file was found at this path: {metadataManifestPath}" 2110 ) from err 2111 return manifest 2112 2113 def _add_id_columns_to_manifest( 2114 self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer 2115 ): 2116 """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row. 2117 Args: 2118 Manifest loaded as a pd.Dataframe 2119 Returns (pd.DataFrame): 2120 Manifest df with new Id and EntityId columns (and UUID values) if they were not already present. 2121 """ 2122 2123 # Add Id for table updates and fill. 
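        # Precedence note: a legacy `Uuid` column is renamed to `Id` only when the
        # schema itself does not define `Uuid`; otherwise `Uuid` is left untouched
        # and a fresh, empty `Id` column is created instead.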
2124 if not col_in_dataframe("Id", manifest): 2125 # See if schema has `Uuid` column specified 2126 try: 2127 uuid_col_in_schema = dmge.is_class_in_schema( 2128 "Uuid" 2129 ) or dmge.is_class_in_schema("uuid") 2130 except KeyError: 2131 uuid_col_in_schema = False 2132 2133 # Rename `Uuid` column if it wasn't specified in the schema 2134 if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema: 2135 manifest.rename(columns={"Uuid": "Id"}, inplace=True) 2136 # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column 2137 else: 2138 manifest["Id"] = "" 2139 2140 # Retrieve the ID column name (id, Id and ID) are treated the same. 2141 id_col_name = [col for col in manifest.columns if col.lower() == "id"][0] 2142 2143 # Check if values have been added to the Id coulumn, if not add a UUID so value in the row is not blank. 2144 for idx, row in manifest.iterrows(): 2145 if not row[id_col_name]: 2146 gen_uuid = str(uuid.uuid4()) 2147 row[id_col_name] = gen_uuid 2148 manifest.loc[idx, id_col_name] = gen_uuid 2149 2150 # add entityId as a column if not already there or 2151 # fill any blanks with an empty string. 2152 if not col_in_dataframe("entityId", manifest): 2153 manifest["entityId"] = "" 2154 else: 2155 manifest["entityId"].fillna("", inplace=True) 2156 2157 return manifest 2158 2159 def _generate_table_name(self, manifest): 2160 """Helper function to generate a table name for upload to synapse. 2161 2162 Args: 2163 Manifest loaded as a pd.Dataframe 2164 2165 Returns: 2166 table_name (str): Name of the table to load 2167 component_name (str): Name of the manifest component (if applicable) 2168 """ 2169 # Create table name here. 2170 if "Component" in manifest.columns: 2171 component_name = manifest["Component"][0].lower() 2172 table_name = component_name + "_synapse_storage_manifest_table" 2173 else: 2174 component_name = "" 2175 table_name = "synapse_storage_manifest_table" 2176 return table_name, component_name 2177 2178 def _create_entity_id(self, idx, row, manifest, datasetId): 2179 """Helper function to generate an entityId and add it to the appropriate row in the manifest. 2180 Args: 2181 row: current row of manifest being processed 2182 manifest (pd.DataFrame): loaded df containing user supplied data. 2183 datasetId (str): synapse ID of folder containing the dataset 2184 2185 Returns: 2186 manifest (pd.DataFrame): manifest with entityId added to the appropriate row 2187 entityId (str): Generated Entity Id. 
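
        Note:
            `idx` is the manifest index of `row`; the created entity is a Folder
            named with a random UUID, stored under `datasetId`.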
2188 2189 """ 2190 rowEntity = Folder(str(uuid.uuid4()), parent=datasetId) 2191 rowEntity = self.syn.store(rowEntity) 2192 entityId = rowEntity["id"] 2193 self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity) 2194 row["entityId"] = entityId 2195 manifest.loc[idx, "entityId"] = entityId 2196 return manifest, entityId 2197 2198 async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None: 2199 """Process annotations and store them on synapse asynchronously 2200 2201 Args: 2202 requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step 2203 2204 Raises: 2205 RuntimeError: raise a run time error if a task failed to complete 2206 """ 2207 while requests: 2208 done_tasks, pending_tasks = await asyncio.wait( 2209 requests, return_when=asyncio.FIRST_COMPLETED 2210 ) 2211 requests = pending_tasks 2212 2213 for completed_task in done_tasks: 2214 try: 2215 annos = completed_task.result() 2216 2217 if isinstance(annos, Annotations): 2218 logger.info(f"Successfully stored annotations for {annos.id}") 2219 else: 2220 # store annotations if they are not None 2221 if annos: 2222 entity_id = annos["annotations"]["id"] 2223 logger.info( 2224 f"Obtained and processed annotations for {entity_id} entity" 2225 ) 2226 requests.add( 2227 asyncio.create_task( 2228 self.store_async_annotation(annotation_dict=annos) 2229 ) 2230 ) 2231 except Exception as e: 2232 raise RuntimeError(f"failed with { repr(e) }.") from e 2233 2234 @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files") 2235 async def add_annotations_to_entities_files( 2236 self, 2237 dmge, 2238 manifest, 2239 manifest_record_type: str, 2240 datasetId: str, 2241 hideBlanks: bool, 2242 manifest_synapse_table_id="", 2243 annotation_keys: str = "class_label", 2244 ): 2245 """ 2246 Depending on upload type add Ids to entityId row. Add anotations to connected 2247 files and folders. Despite the name of this function, it also applies to folders. 2248 2249 Args: 2250 dmge: DataModelGraphExplorer Object 2251 manifest (pd.DataFrame): loaded df containing user supplied data. 2252 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2253 datasetId (str): synapse ID of folder containing the dataset 2254 hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2255 manifest_synapse_table_id (str): Default is an empty string ''. 2256 annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display 2257 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2258 display label formatting while ensuring the label is formatted properly for Synapse annotations. 
2259 Returns: 2260 manifest (pd.DataFrame): modified to add entitiyId as appropriate 2261 2262 """ 2263 2264 # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting 2265 if "filename" in [col.lower() for col in manifest.columns]: 2266 # get current list of files and store as dataframe 2267 dataset_files = self.getFilesInStorageDataset(datasetId) 2268 files_and_entityIds = self._get_file_entityIds( 2269 dataset_files=dataset_files, only_new_files=False 2270 ) 2271 file_df = pd.DataFrame(files_and_entityIds) 2272 2273 # Merge dataframes to add entityIds 2274 manifest = manifest.merge( 2275 file_df, how="left", on="Filename", suffixes=["_x", None] 2276 ).drop("entityId_x", axis=1) 2277 2278 # Fill `entityId` for each row if missing and annotate entity as appropriate 2279 requests = set() 2280 for idx, row in manifest.iterrows(): 2281 if not row["entityId"] and ( 2282 manifest_record_type == "file_and_entities" 2283 or manifest_record_type == "table_file_and_entities" 2284 ): 2285 manifest, entityId = self._create_entity_id( 2286 idx, row, manifest, datasetId 2287 ) 2288 elif not row["entityId"] and manifest_record_type == "table_and_file": 2289 # If not using entityIds, fill with manifest_table_id so 2290 row["entityId"] = manifest_synapse_table_id 2291 manifest.loc[idx, "entityId"] = manifest_synapse_table_id 2292 entityId = "" 2293 # If the row is the manifest table, do not add annotations 2294 elif row["entityId"] == manifest_synapse_table_id: 2295 entityId = "" 2296 else: 2297 # get the file id of the file to annotate, collected in above step. 2298 entityId = row["entityId"] 2299 2300 # Adding annotations to connected files. 2301 if entityId: 2302 # Format annotations for Synapse 2303 annos_task = asyncio.create_task( 2304 self.format_row_annotations( 2305 dmge, row, entityId, hideBlanks, annotation_keys 2306 ) 2307 ) 2308 requests.add(annos_task) 2309 await self._process_store_annos(requests) 2310 return manifest 2311 2312 @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table") 2313 def upload_manifest_as_table( 2314 self, 2315 dmge: DataModelGraphExplorer, 2316 manifest: pd.DataFrame, 2317 metadataManifestPath: str, 2318 datasetId: str, 2319 table_name: str, 2320 component_name: str, 2321 restrict: bool, 2322 manifest_record_type: str, 2323 hideBlanks: bool, 2324 table_manipulation: str, 2325 table_column_names: str, 2326 annotation_keys: str, 2327 file_annotations_upload: bool = True, 2328 ): 2329 """Upload manifest to Synapse as a table and csv. 2330 Args: 2331 dmge: DataModelGraphExplorer object 2332 manifest (pd.DataFrame): loaded df containing user supplied data. 2333 metadataManifestPath: path to csv containing a validated metadata manifest. 2334 datasetId (str): synapse ID of folder containing the dataset 2335 table_name (str): Generated to name the table being uploaded. 2336 component_name (str): Name of the component manifest that is currently being uploaded. 2337 restrict (bool): Flag for censored data. 2338 manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2339 hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 
            table_manipulation (str): Specifies how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
        Return:
            manifest_synapse_file_id: SynID of the manifest csv uploaded to Synapse.
        """
        # Upload manifest as a table, get the ID and updated manifest.
        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
        )

        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    manifest_synapse_table_id,
                    annotation_keys,
                )
            )
        # Load manifest to Synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            restrict_manifest=restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(annotations=manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
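        # Rebuilding the table with `table_manipulation="update"` pushes the
        # entityId values filled in above back into the Synapse table.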
2397 manifest_synapse_table_id, manifest, _ = self.uploadDB( 2398 dmge=dmge, 2399 manifest=manifest, 2400 datasetId=datasetId, 2401 table_name=table_name, 2402 restrict=restrict, 2403 table_manipulation="update", 2404 table_column_names=table_column_names, 2405 ) 2406 2407 # Set annotations for the table manifest 2408 manifest_annotations = self.format_manifest_annotations( 2409 manifest=manifest, manifest_synapse_id=manifest_synapse_table_id 2410 ) 2411 annotations_manifest_table = self.syn.set_annotations( 2412 annotations=manifest_annotations 2413 ) 2414 manifest_table_entity = self.synapse_entity_tracker.get( 2415 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2416 ) 2417 manifest_table_entity.annotations = annotations_manifest_table 2418 manifest_table_entity.etag = annotations_manifest_table.etag 2419 2420 return manifest_synapse_file_id 2421 2422 @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv") 2423 def upload_manifest_as_csv( 2424 self, 2425 dmge, 2426 manifest, 2427 metadataManifestPath, 2428 datasetId, 2429 restrict, 2430 manifest_record_type, 2431 hideBlanks, 2432 component_name, 2433 annotation_keys: str, 2434 file_annotations_upload: bool = True, 2435 ): 2436 """Upload manifest to Synapse as a csv only. 2437 Args: 2438 dmge: DataModelGraphExplorer object 2439 manifest (pd.DataFrame): loaded df containing user supplied data. 2440 metadataManifestPath: path to csv containing a validated metadata manifest. 2441 datasetId (str): synapse ID of folder containing the dataset 2442 restrict (bool): Flag for censored data. 2443 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2444 hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2445 annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display 2446 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2447 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2448 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2449 Return: 2450 manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 2451 """ 2452 if file_annotations_upload: 2453 manifest = asyncio.run( 2454 self.add_annotations_to_entities_files( 2455 dmge, 2456 manifest, 2457 manifest_record_type, 2458 datasetId, 2459 hideBlanks, 2460 annotation_keys=annotation_keys, 2461 ) 2462 ) 2463 2464 # Load manifest to synapse as a CSV File 2465 manifest_synapse_file_id = self.upload_manifest_file( 2466 manifest, 2467 metadataManifestPath, 2468 datasetId, 2469 restrict, 2470 component_name=component_name, 2471 ) 2472 2473 # Set annotations for the file manifest. 
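        # The returned annotations carry a fresh etag; it is copied onto the cached
        # entity below so the synapse_entity_tracker stays in sync with Synapse.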
2474 manifest_annotations = self.format_manifest_annotations( 2475 manifest, manifest_synapse_file_id 2476 ) 2477 annos = self.syn.set_annotations(manifest_annotations) 2478 manifest_entity = self.synapse_entity_tracker.get( 2479 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2480 ) 2481 manifest_entity.annotations = annos 2482 manifest_entity.etag = annos.etag 2483 2484 logger.info("Associated manifest file with dataset on Synapse.") 2485 2486 return manifest_synapse_file_id 2487 2488 @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo") 2489 def upload_manifest_combo( 2490 self, 2491 dmge, 2492 manifest, 2493 metadataManifestPath, 2494 datasetId, 2495 table_name, 2496 component_name, 2497 restrict, 2498 manifest_record_type, 2499 hideBlanks, 2500 table_manipulation, 2501 table_column_names: str, 2502 annotation_keys: str, 2503 file_annotations_upload: bool = True, 2504 ): 2505 """Upload manifest to Synapse as a table and CSV with entities. 2506 Args: 2507 dmge: DataModelGraphExplorer object 2508 manifest (pd.DataFrame): loaded df containing user supplied data. 2509 metadataManifestPath: path to csv containing a validated metadata manifest. 2510 datasetId (str): synapse ID of folder containing the dataset 2511 table_name (str): Generated to name the table being uploaded. 2512 component_name (str): Name of the component manifest that is currently being uploaded. 2513 restrict (bool): Flag for censored data. 2514 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2515 hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2516 table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2517 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2518 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2519 display label formatting. 2520 annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display 2521 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2522 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2523 file_annotations_upload (bool): Defaults to True. If false, do not add annotations to files. 2524 Returns: 2525 manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2526 """ 2527 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2528 dmge=dmge, 2529 manifest=manifest, 2530 datasetId=datasetId, 2531 table_name=table_name, 2532 restrict=restrict, 2533 table_manipulation=table_manipulation, 2534 table_column_names=table_column_names, 2535 ) 2536 2537 if file_annotations_upload: 2538 manifest = asyncio.run( 2539 self.add_annotations_to_entities_files( 2540 dmge, 2541 manifest, 2542 manifest_record_type, 2543 datasetId, 2544 hideBlanks, 2545 manifest_synapse_table_id, 2546 annotation_keys=annotation_keys, 2547 ) 2548 ) 2549 2550 # Load manifest to synapse as a CSV File 2551 manifest_synapse_file_id = self.upload_manifest_file( 2552 manifest, metadataManifestPath, datasetId, restrict, component_name 2553 ) 2554 2555 # Set annotations for the file manifest. 2556 manifest_annotations = self.format_manifest_annotations( 2557 manifest, manifest_synapse_file_id 2558 ) 2559 file_manifest_annoations = self.syn.set_annotations(manifest_annotations) 2560 manifest_entity = self.synapse_entity_tracker.get( 2561 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2562 ) 2563 manifest_entity.annotations = file_manifest_annoations 2564 manifest_entity.etag = file_manifest_annoations.etag 2565 logger.info("Associated manifest file with dataset on Synapse.") 2566 2567 # Update manifest Synapse table with new entity id column. 2568 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2569 dmge=dmge, 2570 manifest=manifest, 2571 datasetId=datasetId, 2572 table_name=table_name, 2573 restrict=restrict, 2574 table_manipulation="update", 2575 table_column_names=table_column_names, 2576 ) 2577 2578 # Set annotations for the table manifest 2579 manifest_annotations = self.format_manifest_annotations( 2580 manifest, manifest_synapse_table_id 2581 ) 2582 table_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2583 manifest_entity = self.synapse_entity_tracker.get( 2584 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2585 ) 2586 manifest_entity.annotations = table_manifest_annotations 2587 manifest_entity.etag = table_manifest_annotations.etag 2588 return manifest_synapse_file_id 2589 2590 @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") 2591 def associateMetadataWithFiles( 2592 self, 2593 dmge: DataModelGraphExplorer, 2594 metadataManifestPath: str, 2595 datasetId: str, 2596 manifest_record_type: str = "table_file_and_entities", 2597 hideBlanks: bool = False, 2598 restrict_manifest=False, 2599 table_manipulation: str = "replace", 2600 table_column_names: str = "class_label", 2601 annotation_keys: str = "class_label", 2602 file_annotations_upload: bool = True, 2603 ) -> str: 2604 """Associate metadata with files in a storage dataset already on Synapse. 2605 Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. 2606 2607 If this is a new manifest there could be no Synapse entities associated with the rows of this manifest 2608 this may be due to data type (e.g. clinical data) being tabular 2609 and not requiring files; to utilize uniform interfaces downstream 2610 (i.e. fileviews), a Synapse entity (a folder) is created for each row 2611 and an entity column is added to the manifest containing the resulting 2612 entity IDs; a table is also created at present as an additional interface 2613 for downstream query and interaction with the data. 
2614 2615 Args: 2616 dmge: DataModelGraphExplorer Object 2617 metadataManifestPath: path to csv containing a validated metadata manifest. 2618 The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. 2619 Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item. 2620 In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to this file. 2621 datasetId: synapse ID of folder containing the dataset 2622 manifest_record_type: Default value is 'table_file_and_entities'. valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options 'file_and_entities' and 'table_and_file' in combination. 2623 hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2624 restrict_manifest (bool): Default is false. Flag for censored data. 2625 table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2626 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2627 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2628 display label formatting. 2629 annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display 2630 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2631 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2632 Returns: 2633 manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2634 """ 2635 # Read new manifest CSV: 2636 manifest = self._read_manifest(metadataManifestPath) 2637 manifest = self._add_id_columns_to_manifest(manifest, dmge) 2638 2639 table_name, component_name = self._generate_table_name(manifest) 2640 2641 # Upload manifest to synapse based on user input (manifest_record_type) 2642 if manifest_record_type == "file_only": 2643 manifest_synapse_file_id = self.upload_manifest_as_csv( 2644 dmge=dmge, 2645 manifest=manifest, 2646 metadataManifestPath=metadataManifestPath, 2647 datasetId=datasetId, 2648 restrict=restrict_manifest, 2649 hideBlanks=hideBlanks, 2650 manifest_record_type=manifest_record_type, 2651 component_name=component_name, 2652 annotation_keys=annotation_keys, 2653 file_annotations_upload=file_annotations_upload, 2654 ) 2655 elif manifest_record_type == "table_and_file": 2656 manifest_synapse_file_id = self.upload_manifest_as_table( 2657 dmge=dmge, 2658 manifest=manifest, 2659 metadataManifestPath=metadataManifestPath, 2660 datasetId=datasetId, 2661 table_name=table_name, 2662 component_name=component_name, 2663 restrict=restrict_manifest, 2664 hideBlanks=hideBlanks, 2665 manifest_record_type=manifest_record_type, 2666 table_manipulation=table_manipulation, 2667 table_column_names=table_column_names, 2668 annotation_keys=annotation_keys, 2669 file_annotations_upload=file_annotations_upload, 2670 ) 2671 elif manifest_record_type == "file_and_entities": 2672 manifest_synapse_file_id = self.upload_manifest_as_csv( 2673 dmge=dmge, 2674 manifest=manifest, 2675 metadataManifestPath=metadataManifestPath, 2676 datasetId=datasetId, 2677 restrict=restrict_manifest, 2678 hideBlanks=hideBlanks, 2679 manifest_record_type=manifest_record_type, 2680 component_name=component_name, 2681 annotation_keys=annotation_keys, 2682 file_annotations_upload=file_annotations_upload, 2683 ) 2684 elif manifest_record_type == "table_file_and_entities": 2685 manifest_synapse_file_id = self.upload_manifest_combo( 2686 dmge=dmge, 2687 manifest=manifest, 2688 metadataManifestPath=metadataManifestPath, 2689 datasetId=datasetId, 2690 table_name=table_name, 2691 component_name=component_name, 2692 restrict=restrict_manifest, 2693 hideBlanks=hideBlanks, 2694 manifest_record_type=manifest_record_type, 2695 table_manipulation=table_manipulation, 2696 table_column_names=table_column_names, 2697 annotation_keys=annotation_keys, 2698 file_annotations_upload=file_annotations_upload, 2699 ) 2700 else: 2701 raise ValueError("Please enter a valid manifest_record_type.") 2702 return manifest_synapse_file_id 2703 2704 def getTableAnnotations(self, table_id: str): 2705 """Generate dictionary of annotations for the given Synapse file. 2706 Synapse returns all custom annotations as lists since they 2707 can contain multiple values. In all cases, the values will 2708 be converted into strings and concatenated with ", ". 2709 2710 Args: 2711 fileId (str): Synapse ID for dataset file. 2712 2713 Returns: 2714 dict: Annotations as comma-separated strings. 
2715 """ 2716 try: 2717 entity = self.synapse_entity_tracker.get( 2718 synapse_id=table_id, syn=self.syn, download_file=False 2719 ) 2720 is_table = entity.concreteType.endswith(".TableEntity") 2721 annotations_raw = entity.annotations 2722 except SynapseHTTPError: 2723 # If an error occurs with retrieving entity, skip it 2724 # This could be caused by a temporary file view that 2725 # was deleted since its ID was retrieved 2726 is_file, is_table = False, False 2727 2728 # Skip anything that isn't a file or folder 2729 if not (is_table): 2730 return None 2731 2732 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2733 2734 return annotations 2735 2736 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2737 """Generate dictionary of annotations for the given Synapse file. 2738 Synapse returns all custom annotations as lists since they 2739 can contain multiple values. In all cases, the values will 2740 be converted into strings and concatenated with ", ". 2741 2742 Args: 2743 fileId (str): Synapse ID for dataset file. 2744 2745 Returns: 2746 dict: Annotations as comma-separated strings. 2747 """ 2748 2749 # Get entity metadata, including annotations 2750 try: 2751 entity = self.synapse_entity_tracker.get( 2752 synapse_id=fileId, syn=self.syn, download_file=False 2753 ) 2754 is_file = entity.concreteType.endswith(".FileEntity") 2755 is_folder = entity.concreteType.endswith(".Folder") 2756 annotations_raw = entity.annotations 2757 except SynapseHTTPError: 2758 # If an error occurs with retrieving entity, skip it 2759 # This could be caused by a temporary file view that 2760 # was deleted since its ID was retrieved 2761 is_file, is_folder = False, False 2762 2763 # Skip anything that isn't a file or folder 2764 if not (is_file or is_folder): 2765 return None 2766 2767 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2768 2769 return annotations 2770 2771 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2772 # Extract annotations from their lists and stringify. For example: 2773 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2774 annotations = dict() 2775 for key, vals in annotations_raw.items(): 2776 if isinstance(vals, list) and len(vals) == 1: 2777 annotations[key] = str(vals[0]) 2778 else: 2779 annotations[key] = ", ".join(str(v) for v in vals) 2780 2781 # Add the file entity ID and eTag, which weren't lists 2782 assert fileId == entity.id, ( 2783 "For some reason, the Synapse ID in the response doesn't match" 2784 "the Synapse ID sent in the request (via synapseclient)." 2785 ) 2786 annotations["entityId"] = fileId 2787 annotations["eTag"] = entity.etag 2788 2789 return annotations 2790 2791 def getDatasetAnnotations( 2792 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2793 ) -> pd.DataFrame: 2794 """Generate table for annotations across all files in given dataset. 2795 2796 Args: 2797 datasetId (str): Synapse ID for dataset folder. 2798 fill_na (bool): Whether to replace missing values with 2799 blank strings. 2800 force_batch (bool): Whether to force the function to use 2801 the batch mode, which uses a file view to retrieve 2802 annotations for a given dataset. Default to False 2803 unless there are more than 50 files in the dataset. 2804 2805 Returns: 2806 pd.DataFrame: Table of annotations. 
2807 """ 2808 # Get all files in given dataset 2809 dataset_files = self.getFilesInStorageDataset(datasetId) 2810 2811 # if there are no dataset files, there are no annotations 2812 # return None 2813 if not dataset_files: 2814 return pd.DataFrame() 2815 2816 dataset_files_map = dict(dataset_files) 2817 dataset_file_ids, _ = list(zip(*dataset_files)) 2818 2819 # Get annotations for each file from Step 1 2820 # Batch mode 2821 try_batch = len(dataset_files) >= 50 or force_batch 2822 if try_batch: 2823 try: 2824 logger.info("Trying batch mode for retrieving Synapse annotations") 2825 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2826 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2827 logger.info( 2828 f"Unable to create a temporary file view bound to {datasetId}. " 2829 "Defaulting to slower iterative retrieval of annotations." 2830 ) 2831 # Default to the slower non-batch method 2832 logger.info("Batch mode failed (probably due to permission error)") 2833 try_batch = False 2834 2835 # Non-batch mode 2836 if not try_batch: 2837 logger.info("Using slower (non-batch) sequential mode") 2838 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2839 # Remove any annotations for non-file/folders (stored as None) 2840 records = filter(None, records) 2841 table = pd.DataFrame.from_records(records) 2842 2843 # Add filenames for the files that "survived" annotation retrieval 2844 filenames = [dataset_files_map[i] for i in table["entityId"]] 2845 2846 if "Filename" not in table.columns: 2847 table.insert(0, "Filename", filenames) 2848 2849 # Ensure that entityId and eTag are at the end 2850 entity_ids = table.pop("entityId") 2851 etags = table.pop("eTag") 2852 table.insert(len(table.columns), "entityId", entity_ids) 2853 table.insert(len(table.columns), "eTag", etags) 2854 2855 # Missing values are filled in with empty strings for Google Sheets 2856 if fill_na: 2857 table.fillna("", inplace=True) 2858 2859 # Force all values as strings 2860 return table.astype(str) 2861 2862 def raise_final_error(retry_state): 2863 return retry_state.outcome.result() 2864 2865 def checkIfinAssetView(self, syn_id) -> str: 2866 # get data in administrative fileview for this pipeline 2867 assetViewTable = self.getStorageFileviewTable() 2868 all_files = list(assetViewTable["id"]) 2869 if syn_id in all_files: 2870 return True 2871 else: 2872 return False 2873 2874 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2875 @retry( 2876 stop=stop_after_attempt(5), 2877 wait=wait_chain( 2878 *[wait_fixed(10) for i in range(2)] 2879 + [wait_fixed(15) for i in range(2)] 2880 + [wait_fixed(20)] 2881 ), 2882 retry=retry_if_exception_type(LookupError), 2883 retry_error_callback=raise_final_error, 2884 ) 2885 def getDatasetProject(self, datasetId: str) -> str: 2886 """Get parent project for a given dataset ID. 2887 2888 Args: 2889 datasetId (str): Synapse entity ID (folder or project). 2890 2891 Raises: 2892 ValueError: Raised if Synapse ID cannot be retrieved 2893 by the user or if it doesn't appear in the file view. 2894 2895 Returns: 2896 str: The Synapse ID for the parent project. 
2897 """ 2898 2899 # Subset main file view 2900 dataset_index = self.storageFileviewTable["id"] == datasetId 2901 dataset_row = self.storageFileviewTable[dataset_index] 2902 2903 # re-query if no datasets found 2904 if dataset_row.empty: 2905 sleep(5) 2906 self.query_fileview(force_requery=True) 2907 # Subset main file view 2908 dataset_index = self.storageFileviewTable["id"] == datasetId 2909 dataset_row = self.storageFileviewTable[dataset_index] 2910 2911 # Return `projectId` for given row if only one found 2912 if len(dataset_row) == 1: 2913 dataset_project = dataset_row["projectId"].values[0] 2914 return dataset_project 2915 2916 # Otherwise, check if already project itself 2917 try: 2918 syn_object = self.synapse_entity_tracker.get( 2919 synapse_id=datasetId, syn=self.syn, download_file=False 2920 ) 2921 if syn_object.properties["concreteType"].endswith("Project"): 2922 return datasetId 2923 except SynapseHTTPError: 2924 raise PermissionError( 2925 f"The given dataset ({datasetId}) isn't accessible with this " 2926 "user. This might be caused by a typo in the dataset Synapse ID." 2927 ) 2928 2929 # If not, then assume dataset not in file view 2930 raise LookupError( 2931 f"The given dataset ({datasetId}) doesn't appear in the " 2932 f"configured file view ({self.storageFileview}). This might " 2933 "mean that the file view's scope needs to be updated." 2934 ) 2935 2936 def getDatasetAnnotationsBatch( 2937 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2938 ) -> pd.DataFrame: 2939 """Generate table for annotations across all files in given dataset. 2940 This function uses a temporary file view to generate a table 2941 instead of iteratively querying for individual entity annotations. 2942 This function is expected to run much faster than 2943 `self.getDatasetAnnotationsBatch` on large datasets. 2944 2945 Args: 2946 datasetId (str): Synapse ID for dataset folder. 2947 dataset_file_ids (Sequence[str]): List of Synapse IDs 2948 for dataset files/folders used to subset the table. 2949 2950 Returns: 2951 pd.DataFrame: Table of annotations. 2952 """ 2953 # Create data frame from annotations file view 2954 with DatasetFileView(datasetId, self.syn) as fileview: 2955 table = fileview.query() 2956 2957 if dataset_file_ids: 2958 table = table.loc[table.index.intersection(dataset_file_ids)] 2959 2960 table = table.reset_index(drop=True) 2961 2962 return table 2963 2964 def _get_table_schema_by_cname(self, table_schema): 2965 # assume no duplicate column names in the table 2966 table_schema_by_cname = {} 2967 2968 for col_record in table_schema: 2969 # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key) 2970 table_schema_by_cname[col_record["name"]] = col_record 2971 2972 return table_schema_by_cname
Implementation of the Storage interface for datasets/files stored on Synapse. Provides utilities to list files in a specific project, update file annotations, create file views, etc.
TODO: Need to define the interface and rename and/or refactor some of the methods below.
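For orientation, a minimal end-to-end usage sketch, not taken from the source: the JSON-LD path, manifest path, and syn ID are placeholders, and the DataModelParser import path is assumed from elsewhere in the schematic package.

# Build a schema explorer, then associate a validated manifest with a dataset.
# Credentials come from SYNAPSE_ACCESS_TOKEN or the .synapseConfig file.
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser  # assumed import path
from schematic.store.synapse import SynapseStorage

parsed_model = DataModelParser(path_to_data_model="example.model.jsonld").parse_model()
dmge = DataModelGraphExplorer(DataModelGraph(parsed_model).generate_data_model_graph())

store = SynapseStorage()
manifest_file_id = store.associateMetadataWithFiles(
    dmge=dmge,
    metadataManifestPath="synapse_storage_manifest.csv",  # placeholder path
    datasetId="syn12345678",  # placeholder dataset folder ID
    manifest_record_type="table_and_file",
)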
298 @tracer.start_as_current_span("SynapseStorage::__init__") 299 def __init__( 300 self, 301 token: Optional[str] = None, # optional parameter retrieved from browser cookie 302 access_token: Optional[str] = None, 303 project_scope: Optional[list] = None, 304 synapse_cache_path: Optional[str] = None, 305 perform_query: Optional[bool] = True, 306 columns: Optional[list] = None, 307 where_clauses: Optional[list] = None, 308 ) -> None: 309 """Initializes a SynapseStorage object. 310 311 Args: 312 token (Optional[str], optional): 313 Optional token parameter as found in browser cookie upon login to synapse. 314 Defaults to None. 315 access_token (Optional[str], optional): 316 Optional access token (personal or oauth). 317 Defaults to None. 318 project_scope (Optional[list], optional): Defaults to None. 319 synapse_cache_path (Optional[str], optional): 320 Location of synapse cache. 321 Defaults to None. 322 TODO: 323 Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands. 324 """ 325 self.syn = self.login(synapse_cache_path, access_token) 326 self.project_scope = project_scope 327 self.storageFileview = CONFIG.synapse_master_fileview_id 328 self.manifest = CONFIG.synapse_manifest_basename 329 self.root_synapse_cache = self.syn.cache.cache_root_dir 330 self.synapse_entity_tracker = SynapseEntityTracker() 331 if perform_query: 332 self.query_fileview(columns=columns, where_clauses=where_clauses)
Initializes a SynapseStorage object.
Arguments:
- token (Optional[str], optional): Optional token parameter as found in browser cookie upon login to synapse. Defaults to None.
- access_token (Optional[str], optional): Optional access token (personal or oauth). Defaults to None.
- project_scope (Optional[list], optional): Defaults to None.
- synapse_cache_path (Optional[str], optional): Location of synapse cache. Defaults to None.
TODO:
Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
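As a sketch of the constructor options above (assuming a valid SYNAPSE_ACCESS_TOKEN or .synapseConfig; column names here are illustrative fileview columns):

from schematic.store.synapse import SynapseStorage

# Defer the potentially large fileview query at construction time,
# then run a narrower query explicitly.
store = SynapseStorage(perform_query=False)
store.query_fileview(
    columns=["id", "name", "parentId", "type"],
    where_clauses=["type='file'"],
)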
371 @tracer.start_as_current_span("SynapseStorage::query_fileview") 372 def query_fileview( 373 self, 374 columns: Optional[list] = None, 375 where_clauses: Optional[list] = None, 376 force_requery: Optional[bool] = False, 377 ) -> None: 378 """ 379 Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. 380 Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes. 381 Args: 382 columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns. 383 where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None. 384 force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False. 385 """ 386 self._purge_synapse_cache() 387 388 # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed 389 self.new_query_different = True 390 391 # If a query has already been performed, store the query 392 previous_query_built = hasattr(self, "fileview_query") 393 if previous_query_built: 394 previous_query = self.fileview_query 395 396 # Build a query with the current given parameters and check to see if it is different from the previous 397 self._build_query(columns=columns, where_clauses=where_clauses) 398 if previous_query_built: 399 self.new_query_different = self.fileview_query != previous_query 400 401 # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved 402 if self.new_query_different or force_requery: 403 try: 404 self.storageFileviewTable = self.syn.tableQuery( 405 query=self.fileview_query, 406 ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False) 407 except SynapseHTTPError as exc: 408 exception_text = str(exc) 409 if "Unknown column path" in exception_text: 410 raise ValueError( 411 "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation." 412 ) 413 elif "Unknown column" in exception_text: 414 missing_column = exception_text.split("Unknown column ")[-1] 415 raise ValueError( 416 f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview." 417 ) 418 else: 419 raise AccessCredentialsError(self.storageFileview)
Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
Arguments:
- columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
- where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
- force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
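Because the built query is compared against the previous one, repeating an identical call is a no-op unless force_requery is set. A sketch, reusing `store` from the earlier example:

# Identical query: skipped. Forced query: re-run against Synapse.
store.query_fileview(columns=["id", "path"], where_clauses=["type='file'"])
store.query_fileview(columns=["id", "path"], where_clauses=["type='file'"], force_requery=True)
fileview_df = store.getStorageFileviewTable()  # results are cached on the instance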
421 @staticmethod 422 def build_clause_from_dataset_id( 423 dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None 424 ) -> str: 425 """ 426 Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized. 427 Args: 428 dataset_id: Synapse ID of a dataset that should be used to limit the query 429 dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query 430 Returns: 431 clause for the query or an empty string if no dataset ID is provided 432 """ 433 # Calling this method without specifying synIDs will complete but will not scope the view 434 if (not dataset_id) and (not dataset_folder_list): 435 return "" 436 437 # This will be used to gather files under a dataset recursively with a fileview query instead of walking 438 if dataset_folder_list: 439 search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list) 440 return f"parentId IN ({search_folders})" 441 442 # `dataset_id` should be provided when all files are stored directly under the dataset folder 443 return f"parentId='{dataset_id}'"
Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
Arguments:
- dataset_id: Synapse ID of a dataset that should be used to limit the query
- dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:
clause for the query or an empty string if no dataset ID is provided
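The returned clause plugs directly into query_fileview's where_clauses. For example (syn IDs are placeholders):

from schematic.store.synapse import SynapseStorage

# Scope to files directly under one dataset folder.
SynapseStorage.build_clause_from_dataset_id(dataset_id="syn11111111")
# -> "parentId='syn11111111'"

# Scope to a dataset folder and all of its subfolders.
SynapseStorage.build_clause_from_dataset_id(
    dataset_folder_list=["syn11111111", "syn22222222"]
)
# -> "parentId IN ('syn11111111', 'syn22222222')"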
483 @staticmethod 484 @tracer.start_as_current_span("SynapseStorage::login") 485 def login( 486 synapse_cache_path: Optional[str] = None, 487 access_token: Optional[str] = None, 488 ) -> synapseclient.Synapse: 489 """Login to Synapse 490 491 Args: 492 access_token (Optional[str], optional): A synapse access token. Defaults to None. 493 synapse_cache_path (Optional[str]): location of synapse cache 494 495 Raises: 496 ValueError: If unable to log in with access token 497 498 Returns: 499 synapseclient.Synapse: A Synapse object that is logged in 500 """ 501 if not access_token: 502 access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") 503 504 # login using a token 505 if access_token: 506 try: 507 syn = synapseclient.Synapse( 508 cache_root_dir=synapse_cache_path, 509 debug=False, 510 skip_checks=True, 511 cache_client=False, 512 ) 513 syn.login(authToken=access_token, silent=True) 514 except SynapseHTTPError as exc: 515 raise ValueError( 516 "No access to resources. Please make sure that your token is correct" 517 ) from exc 518 else: 519 # login using synapse credentials provided by user in .synapseConfig (default) file 520 syn = synapseclient.Synapse( 521 configPath=CONFIG.synapse_configuration_path, 522 cache_root_dir=synapse_cache_path, 523 debug=False, 524 skip_checks=True, 525 cache_client=False, 526 ) 527 syn.login(silent=True) 528 529 # set user id attribute 530 current_span = trace.get_current_span() 531 if current_span.is_recording(): 532 current_span.set_attribute("user.id", syn.credentials.owner_id) 533 534 return syn
Login to Synapse
Arguments:
- access_token (Optional[str], optional): A synapse access token. Defaults to None.
- synapse_cache_path (Optional[str]): location of synapse cache
Raises:
- ValueError: If unable to log in with access token
Returns:
synapseclient.Synapse: A Synapse object that is logged in
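A short sketch of the two login paths (explicit token wins; otherwise SYNAPSE_ACCESS_TOKEN is read, and failing that the .synapseConfig credentials are used):

import os
from schematic.store.synapse import SynapseStorage

# Passing the token explicitly; with access_token=None the environment
# variable and then the .synapseConfig file are tried in turn.
syn = SynapseStorage.login(access_token=os.getenv("SYNAPSE_ACCESS_TOKEN"))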
536 def missing_entity_handler(method): 537 def wrapper(*args, **kwargs): 538 try: 539 return method(*args, **kwargs) 540 except SynapseHTTPError as ex: 541 str_message = str(ex).replace("\n", "") 542 if "trash" in str_message or "does not exist" in str_message: 543 logging.warning(str_message) 544 return None 545 else: 546 raise ex 547 548 return wrapper
550 def async_missing_entity_handler(method): 551 """Decorator to handle missing entities in async methods.""" 552 553 async def wrapper(*args: Any, **kwargs: Any) -> Any: 554 try: 555 return await method(*args, **kwargs) 556 except SynapseHTTPError as ex: 557 str_message = str(ex).replace("\n", "") 558 if "trash" in str_message or "does not exist" in str_message: 559 logging.warning(str_message) 560 return None 561 else: 562 raise ex 563 564 return wrapper
Decorator to handle missing entities in async methods.
566 def getStorageFileviewTable(self): 567 """Returns the storageFileviewTable obtained during initialization.""" 568 return self.storageFileviewTable
Returns the storageFileviewTable obtained during initialization.
570 def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]: 571 """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to. 572 573 Args: 574 currentUserId: synapse id for the user whose projects we want to get. 575 576 Returns: 577 A dictionary with a next page token and the results. 578 """ 579 all_results = self.syn.restGET( 580 "/projects/user/{principalId}".format(principalId=currentUserId) 581 ) 582 583 while ( 584 "nextPageToken" in all_results 585 ): # iterate over next page token in results while there is any 586 results_token = self.syn.restGET( 587 "/projects/user/{principalId}?nextPageToken={nextPageToken}".format( 588 principalId=currentUserId, 589 nextPageToken=all_results["nextPageToken"], 590 ) 591 ) 592 all_results["results"].extend(results_token["results"]) 593 594 if "nextPageToken" in results_token: 595 all_results["nextPageToken"] = results_token["nextPageToken"] 596 else: 597 del all_results["nextPageToken"] 598 599 return all_results
Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
Arguments:
- currentUserId: synapse id for the user whose projects we want to get.
Returns:
A dictionary with a next page token and the results.
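For illustration, the aggregated results can be consumed like so (assuming an initialized `store`; the "results" key holds project headers with "id" and "name" fields, per the method's usage elsewhere in this module):

# One dict with all pages of the user's project headers merged into "results".
all_projects = store.getPaginatedRestResults(store.syn.credentials.owner_id)
project_names = [header["name"] for header in all_projects["results"]]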
601 @tracer.start_as_current_span("SynapseStorage::getStorageProjects") 602 def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]: 603 """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute. 604 605 Returns: 606 A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName). 607 """ 608 609 # get the set of all storage Synapse project accessible for this pipeline 610 storageProjects = self.storageFileviewTable["projectId"].unique() 611 612 # get the set of storage Synapse project accessible for this user 613 # get a list of projects from Synapse 614 current_user_project_headers = self.synapse_entity_tracker.get_project_headers( 615 current_user_id=self.syn.credentials.owner_id, syn=self.syn 616 ) 617 project_id_to_name_dict = {} 618 current_user_projects = [] 619 for project_header in current_user_project_headers: 620 project_id_to_name_dict[project_header.get("id")] = project_header.get( 621 "name" 622 ) 623 current_user_projects.append(project_header.get("id")) 624 625 # find set of user projects that are also in this pipeline's storage projects set 626 storageProjects = list(set(storageProjects) & set(current_user_projects)) 627 628 # Limit projects to scope if specified 629 if project_scope: 630 storageProjects = list(set(storageProjects) & set(project_scope)) 631 632 if not storageProjects: 633 raise Warning( 634 f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}" 635 ) 636 637 # prepare a return list of project IDs and names 638 projects = [] 639 for projectId in storageProjects: 640 project_name_from_project_header = project_id_to_name_dict.get(projectId) 641 projects.append((projectId, project_name_from_project_header)) 642 643 sorted_projects_list = sorted(projects, key=lambda tup: tup[0]) 644 645 return sorted_projects_list
Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
Returns:
A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
647 @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject") 648 def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: 649 """Gets all datasets in folder under a given storage project that the current user has access to. 650 651 Args: 652 projectId: synapse ID of a storage project. 653 654 Returns: 655 A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). 656 None: If the projectId cannot be found on Synapse. 657 """ 658 659 # select all folders and fetch their names from within the storage project; 660 # if folder content type is defined, only select folders that contain datasets 661 if "contentType" in self.storageFileviewTable.columns: 662 foldersTable = self.storageFileviewTable[ 663 (self.storageFileviewTable["contentType"] == "dataset") 664 & (self.storageFileviewTable["projectId"] == projectId) 665 ] 666 else: 667 foldersTable = self.storageFileviewTable[ 668 (self.storageFileviewTable["type"] == "folder") 669 & (self.storageFileviewTable["parentId"] == projectId) 670 ] 671 672 # get an array of tuples (folderId, folderName) 673 # some folders are part of datasets; others contain datasets 674 # each dataset parent is the project; folders part of a dataset have another folder as a parent 675 # to get folders if and only if they contain datasets for each folder 676 # check if folder's parent is the project; if so that folder contains a dataset, 677 # unless the folder list has already been filtered to dataset folders based on contentType attribute above 678 679 datasetList = [] 680 folderProperties = ["id", "name"] 681 for folder in list( 682 foldersTable[folderProperties].itertuples(index=False, name=None) 683 ): 684 datasetList.append(folder) 685 686 sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) 687 688 return sorted_dataset_list
Gets all datasets in folder under a given storage project that the current user has access to.
Arguments:
- projectId: synapse ID of a storage project.
Returns:
A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). None: If the projectId cannot be found on Synapse.
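Together with getStorageProjects, this supports a simple walk over the asset view; a sketch, assuming an initialized `store`:

for project_id, project_name in store.getStorageProjects():
    for dataset_id, dataset_name in store.getStorageDatasetsInProject(project_id):
        print(f"{project_name} / {dataset_name} ({dataset_id})")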
690 @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") 691 def getFilesInStorageDataset( 692 self, datasetId: str, fileNames: List = None, fullpath: bool = True 693 ) -> List[Tuple[str, str]]: 694 """Gets all files (excluding manifest files) in a given dataset folder. 695 696 Args: 697 datasetId: synapse ID of a storage dataset. 698 fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. 699 metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 700 fullpath: if True return the full path as part of this filename; otherwise return just base filename 701 702 Returns: 703 A list of files; the list consists of tuples (fileId, fileName). 704 705 Raises: 706 ValueError: If the fileview is empty. LookupError: If the dataset ID cannot be found in the fileview. 707 """ 708 file_list = [] 709 710 # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view 711 if self.storageFileviewTable.empty: 712 raise ValueError( 713 f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." 714 ) 715 716 child_path = self.storageFileviewTable.loc[ 717 self.storageFileviewTable["parentId"] == datasetId, "path" 718 ] 719 if child_path.empty: 720 raise LookupError( 721 f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 722 ) 723 child_path = child_path.iloc[0] 724 725 # Get the dataset path by eliminating the child's portion of the path to account for nested datasets 726 parent = child_path.split("/")[:-1] 727 parent = "/".join(parent) 728 729 # Format dataset path to be used in table query 730 dataset_path = f"'{parent}/%'" 731 732 # When querying, only include files to exclude entity files and subdirectories 733 where_clauses = [f"path like {dataset_path}", "type='file'"] 734 735 # Requery the fileview to specifically get the files in the given dataset 736 self.query_fileview(columns=["id", "path"], where_clauses=where_clauses) 737 738 # Exclude manifest files 739 non_manifest_files = self.storageFileviewTable.loc[ 740 ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"), 741 :, 742 ] 743 744 # Remove all files that are not in the list of fileNames 745 if fileNames: 746 filename_regex = "|".join(fileNames) 747 748 matching_files = non_manifest_files["path"].str.contains( 749 filename_regex, case=False, regex=True 750 ) 751 752 non_manifest_files = non_manifest_files.loc[matching_files, :] 753 754 # Truncate path if necessary 755 if not fullpath: 756 non_manifest_files.path = non_manifest_files.path.apply(os.path.basename) 757 758 # Return list of files as expected by other methods 759 file_list = list(non_manifest_files.itertuples(index=False, name=None)) 760 761 return file_list
Gets all files (excluding manifest files) in a given dataset folder.
Arguments:
- datasetId: synapse ID of a storage dataset.
- fileNames: get a list of files with particular names; defaults to None, in which case all dataset files are returned (except bookkeeping files, e.g. metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
- fullpath: if True return the full path as part of this filename; otherwise return just base filename
Returns:
A list of files; the list consists of tuples (fileId, fileName).
Raises:
- ValueError: If the fileview is empty.
- LookupError: If the dataset ID cannot be found in the fileview.
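For example (dataset ID and file name are placeholders):

# All non-manifest files in the dataset, as (fileId, fileName) tuples.
files = store.getFilesInStorageDataset("syn11111111")

# Only files whose names match the given list, with paths truncated to basenames.
matching = store.getFilesInStorageDataset(
    "syn11111111", fileNames=["sample_A.bam"], fullpath=False
)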
788 @tracer.start_as_current_span("SynapseStorage::getDatasetManifest") 789 def getDatasetManifest( 790 self, 791 datasetId: str, 792 downloadFile: bool = False, 793 newManifestName: str = "", 794 use_temporary_folder: bool = True, 795 ) -> Union[str, File]: 796 """Gets the manifest associated with a given dataset. 797 798 Args: 799 datasetId: synapse ID of a storage dataset. 800 downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not. 801 newManifestName: new name of a manifest that gets downloaded 802 use_temporary_folder: boolean argument indicating if a temporary folder 803 should be used to store the manifest file. This is useful when running 804 this code as an API server where multiple requests could be made at the 805 same time. This is set to False when the code is being used from the 806 CLI. Defaults to True. 807 808 Returns: 809 manifest_syn_id (String): Synapse ID of existing manifest file. 810 manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True. 811 "" (String): No pre-existing manifest in dataset. 812 """ 813 manifest_data = "" 814 815 # get a list of files containing the manifest for this dataset (if any) 816 all_files = self.storageFileviewTable 817 818 # construct regex based on manifest basename in the config 819 manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") 820 821 # search manifest based on given manifest basename regex above 822 # and return a dataframe containing name and id of manifests in a given asset view 823 manifest = all_files[ 824 (all_files["name"].str.contains(manifest_re, regex=True)) 825 & (all_files["parentId"] == datasetId) 826 ] 827 828 manifest = manifest[["id", "name"]] 829 830 # if there is no pre-existing manifest in the specified dataset 831 if manifest.empty: 832 logger.warning( 833 f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" 834 ) 835 return "" 836 837 # if there is an existing manifest 838 else: 839 manifest_syn_id = self._get_manifest_id(manifest) 840 if downloadFile: 841 md = ManifestDownload( 842 self.syn, 843 manifest_id=manifest_syn_id, 844 synapse_entity_tracker=self.synapse_entity_tracker, 845 ) 846 manifest_data = md.download_manifest( 847 newManifestName=newManifestName, 848 manifest_df=manifest, 849 use_temporary_folder=use_temporary_folder, 850 ) 851 # TO DO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string, 852 # then we should catch the error here without returning an empty string. 853 if not manifest_data: 854 logger.debug( 855 f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" 856 ) 857 return manifest_data 858 return manifest_syn_id
Gets the manifest associated with a given dataset.
Arguments:
- datasetId: synapse ID of a storage dataset.
- downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
- newManifestName: new name of a manifest that gets downloaded
- use_temporary_folder: boolean argument indicating if a temporary folder should be used to store the manifest file. This is useful when running this code as an API server where multiple requests could be made at the same time. This is set to False when the code is being used from the CLI. Defaults to True.
Returns:
manifest_syn_id (String): Synapse ID of existing manifest file. manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True. "" (String): No pre-existing manifest in dataset.
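A sketch of the two modes (placeholder dataset ID; the entity's local path is available after download):

# Look up the manifest's Synapse ID only; returns "" if none exists.
manifest_id = store.getDatasetManifest("syn11111111")

# Download the manifest and work with the local copy.
manifest_entity = store.getDatasetManifest("syn11111111", downloadFile=True)
if manifest_entity:
    print(manifest_entity.path)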
860 def getDataTypeFromManifest(self, manifestId: str): 861 """Fetch a manifest and return data types of all columns 862 Args: 863 manifestId: synapse ID of a manifest 864 """ 865 # get manifest file path 866 manifest_entity = self.synapse_entity_tracker.get( 867 synapse_id=manifestId, syn=self.syn, download_file=True 868 ) 869 manifest_filepath = manifest_entity.path 870 871 # load manifest dataframe 872 manifest = load_df( 873 manifest_filepath, 874 preserve_raw_input=False, 875 data_model=False, 876 ) 877 878 # convert the dataFrame to use best possible dtypes. 879 manifest_new = manifest.convert_dtypes() 880 881 # get data types of columns 882 result = manifest_new.dtypes.to_frame("dtypes").reset_index() 883 884 # return the result as a dictionary 885 result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() 886 887 return result_dict
Fetch a manifest and return data types of all columns
Arguments:
- manifestId: synapse ID of a manifest
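Illustrative usage (the manifest ID is a placeholder, and the exact keys and dtype strings depend on the manifest's columns):

dtypes = store.getDataTypeFromManifest("syn33333333")
# e.g. {"Filename": "string", "YearofBirth": "Int64", "entityId": "string"}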
911 def add_entity_id_and_filename( 912 self, datasetId: str, manifest: pd.DataFrame 913 ) -> pd.DataFrame: 914 """add entityid and filename column to an existing manifest assuming entityId column is not already present 915 916 Args: 917 datasetId (str): dataset syn id 918 manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty 919 920 Returns: 921 pd.DataFrame: returns a pandas dataframe 922 """ 923 # get file names and entity ids of a given dataset 924 dataset_files_dict = self._get_files_metadata_from_dataset( 925 datasetId, only_new_files=False 926 ) 927 928 if dataset_files_dict: 929 # turn manifest dataframe back to a dictionary for operation 930 manifest_dict = manifest.to_dict("list") 931 932 # update Filename column 933 # add entityId column to the end 934 manifest_dict.update(dataset_files_dict) 935 936 # if the component column exists in existing manifest, fill up that column 937 if "Component" in manifest_dict.keys(): 938 manifest_dict["Component"] = manifest_dict["Component"] * max( 939 1, len(manifest_dict["Filename"]) 940 ) 941 942 # turn dictionary back to a dataframe 943 manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index") 944 manifest_df_updated = manifest_df_index.transpose() 945 946 # fill na with empty string 947 manifest_df_updated = manifest_df_updated.fillna("") 948 949 # drop index 950 manifest_df_updated = manifest_df_updated.reset_index(drop=True) 951 952 return manifest_df_updated 953 else: 954 return manifest
Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present
Arguments:
- datasetId (str): dataset syn id
- manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
Returns:
pd.DataFrame: returns a pandas dataframe
956 def fill_in_entity_id_filename( 957 self, datasetId: str, manifest: pd.DataFrame 958 ) -> Tuple[List, pd.DataFrame]: 959 """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present. 960 961 Args: 962 datasetId (str): dataset syn id 963 manifest (pd.DataFrame): existing manifest dataframe. 964 965 Returns: 966 Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe 967 """ 968 # get dataset file names and entity id as a list of tuple 969 dataset_files = self.getFilesInStorageDataset(datasetId) 970 971 # update manifest with additional filenames, if any 972 # note that if there is an existing manifest and there are files in the dataset 973 # the columns Filename and entityId are assumed to be present in manifest schema 974 # TODO: use idiomatic panda syntax 975 if not dataset_files: 976 manifest = manifest.fillna("") 977 return dataset_files, manifest 978 979 all_files = self._get_file_entityIds( 980 dataset_files=dataset_files, only_new_files=False, manifest=manifest 981 ) 982 new_files = self._get_file_entityIds( 983 dataset_files=dataset_files, only_new_files=True, manifest=manifest 984 ) 985 986 all_files = pd.DataFrame(all_files) 987 new_files = pd.DataFrame(new_files) 988 989 # update manifest so that it contains new dataset files 990 manifest = ( 991 pd.concat([manifest, new_files], sort=False) 992 .reset_index() 993 .drop("index", axis=1) 994 ) 995 996 # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata 997 manifest_reindex = manifest.set_index("entityId") 998 all_files_reindex = all_files.set_index("entityId") 999 all_files_reindex_like_manifest = all_files_reindex.reindex_like( 1000 manifest_reindex 1001 ) 1002 1003 # Check if individual file paths in manifest and from synapse match 1004 file_paths_match = ( 1005 manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"] 1006 ) 1007 1008 # If all the paths do not match, update the manifest with the filepaths from synapse 1009 if not file_paths_match.all(): 1010 manifest_reindex.loc[ 1011 ~file_paths_match, "Filename" 1012 ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"] 1013 1014 # reformat manifest for further use 1015 manifest = manifest_reindex.reset_index() 1016 entityIdCol = manifest.pop("entityId") 1017 manifest.insert(len(manifest.columns), "entityId", entityIdCol) 1018 1019 manifest = manifest.fillna("") 1020 return dataset_files, manifest
Fill in the Filename and entityId columns; both will be created if not already present.
Arguments:
- datasetId (str): dataset syn id
- manifest (pd.DataFrame): existing manifest dataframe.
Returns:
Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
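A sketch of the round trip (placeholder IDs and paths; load_df is the loader used throughout this module):

from schematic.utils.df_utils import load_df

manifest_df = load_df("synapse_storage_manifest.csv")  # placeholder path
dataset_files, manifest_df = store.fill_in_entity_id_filename(
    datasetId="syn11111111", manifest=manifest_df
)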
1022 @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles") 1023 def updateDatasetManifestFiles( 1024 self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True 1025 ) -> Union[Tuple[str, pd.DataFrame], None]: 1026 """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any. 1027 1028 Args: 1029 dmge: DataModelGraphExplorer Instance 1030 datasetId: synapse ID of a storage dataset. 1031 store: if set to True store updated manifest in asset store; if set to False 1032 return a Pandas dataframe containing updated manifest but do not store to asset store 1033 1034 1035 Returns: 1036 Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. 1037 If there is no existing manifest or if the manifest does not have an entityId column, return None 1038 """ 1039 1040 # get existing manifest Synapse ID 1041 manifest_id = self.getDatasetManifest(datasetId) 1042 1043 # if there is no manifest return None 1044 if not manifest_id: 1045 return None 1046 1047 manifest_entity = self.synapse_entity_tracker.get( 1048 synapse_id=manifest_id, syn=self.syn, download_file=True 1049 ) 1050 manifest_filepath = manifest_entity.path 1051 manifest = load_df(manifest_filepath) 1052 1053 # If the manifest does not have an entityId column, trigger a new manifest to be generated 1054 if "entityId" not in manifest.columns: 1055 return None 1056 1057 manifest_is_file_based = "Filename" in manifest.columns 1058 1059 if manifest_is_file_based: 1060 # update manifest with additional filenames, if any 1061 # note that if there is an existing manifest and there are files in the dataset 1062 # the columns Filename and entityId are assumed to be present in manifest schema 1063 # TODO: use idiomatic panda syntax 1064 dataset_files, manifest = self.fill_in_entity_id_filename( 1065 datasetId, manifest 1066 ) 1067 if dataset_files: 1068 # update the manifest file, so that it contains the relevant entity IDs 1069 if store: 1070 manifest.to_csv(manifest_filepath, index=False) 1071 1072 # store manifest and update associated metadata with manifest on Synapse 1073 manifest_id = self.associateMetadataWithFiles( 1074 dmge, manifest_filepath, datasetId 1075 ) 1076 1077 return manifest_id, manifest
Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
Arguments:
- dmge: DataModelGraphExplorer Instance
- datasetId: synapse ID of a storage dataset.
- store: if set to True, store updated manifest in asset store; if set to False, return a Pandas dataframe containing updated manifest but do not store to asset store
Returns:
Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. If there is no existing manifest or if the manifest does not have an entityId column, return None
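For example, to preview the updated manifest without writing it back to the asset store (reusing `store` and `dmge` from the earlier sketches):

result = store.updateDatasetManifestFiles(dmge, datasetId="syn11111111", store=False)
if result is not None:
    manifest_id, manifest_df = result  # updated manifest, not yet stored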
1123 @tracer.start_as_current_span("SynapseStorage::getProjectManifests") 1124 def getProjectManifests( 1125 self, projectId: str 1126 ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: 1127 """Gets all metadata manifest files across all datasets in a specified project. 1128 1129 Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest 1130 as a list of tuples, one for each manifest: 1131 [ 1132 ( 1133 (datasetId, dataName), 1134 (manifestId, manifestName), 1135 (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema 1136 ), 1137 ... 1138 ] 1139 1140 TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface 1141 """ 1142 component = None 1143 entity = None 1144 manifests = [] 1145 1146 datasets = self.getStorageDatasetsInProject(projectId) 1147 1148 for datasetId, datasetName in datasets: 1149 # encode information about the manifest in a simple list (so that R clients can unpack it) 1150 # eventually can serialize differently 1151 1152 # Get synID of manifest for a dataset 1153 manifestId = self.getDatasetManifest(datasetId) 1154 1155 # If a manifest exists, get the annotations for it, else return base 'manifest' tuple 1156 if manifestId: 1157 annotations = self.getFileAnnotations(manifestId) 1158 1159 # If manifest has annotations specifying component, use that 1160 if annotations and "Component" in annotations: 1161 component = annotations["Component"] 1162 entity = self.synapse_entity_tracker.get( 1163 synapse_id=manifestId, syn=self.syn, download_file=False 1164 ) 1165 manifest_name = entity["properties"]["name"] 1166 1167 # otherwise download the manifest and parse for information 1168 elif not annotations or "Component" not in annotations: 1169 logging.debug( 1170 f"No component annotations have been found for manifest {manifestId}. " 1171 "The manifest will be downloaded and parsed instead. " 1172 "For increased speed, add component annotations to manifest." 1173 ) 1174 1175 manifest_info = self.getDatasetManifest( 1176 datasetId, downloadFile=True 1177 ) 1178 manifest_name = manifest_info["properties"].get("name", "") 1179 1180 if not manifest_name: 1181 logger.error(f"Failed to download manifests from {datasetId}") 1182 1183 manifest_path = manifest_info["path"] 1184 1185 manifest_df = load_df(manifest_path) 1186 1187 # Get component from component column if it exists 1188 if ( 1189 "Component" in manifest_df 1190 and not manifest_df["Component"].empty 1191 ): 1192 1193 component = list(set(manifest_df["Component"])) 1194 1195 # Added to address issues raised during DCA testing 1196 if "" in component: 1197 component.remove("") 1198 1199 if len(component) == 1: 1200 component = component[0] 1201 elif len(component) > 1: 1202 logging.warning( 1203 f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1204 "Behavior of manifests with multiple components is undefined" 1205 ) 1206 else: 1207 manifest_name = "" 1208 component = None 1209 if component: 1210 manifest = ( 1211 (datasetId, datasetName), 1212 (manifestId, manifest_name), 1213 (component, component), 1214 ) 1215 elif manifestId: 1216 logging.debug( 1217 f"Manifest {manifestId} does not have an associated Component" 1218 ) 1219 manifest = ( 1220 (datasetId, datasetName), 1221 (manifestId, manifest_name), 1222 ("", ""), 1223 ) 1224 else: 1225 manifest = ( 1226 (datasetId, datasetName), 1227 ("", ""), 1228 ("", ""), 1229 ) 1230 1231 if manifest: 1232 manifests.append(manifest) 1233 1234 return manifests
Gets all metadata manifest files across all datasets in a specified project.
Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest, as a list of tuples, one for each manifest:
[
    (
        (datasetId, dataName),
        (manifestId, manifestName),
        (componentSchemaLabel, componentSchemaLabel)  TODO: get component name from schema
    ),
    ...
]
TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
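The nested-tuple return value unpacks naturally in a loop. A minimal usage sketch, assuming `SynapseStorage()` can be constructed with the module's default configuration and authentication, and using "syn123" as a placeholder project ID:

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical construction; may require auth/config not shown here

for (dataset_id, dataset_name), (manifest_id, manifest_name), (component, _) in (
    store.getProjectManifests(projectId="syn123")
):
    # Datasets without a manifest come back as empty-string placeholders
    if not manifest_id:
        print(f"{dataset_name} ({dataset_id}): no manifest")
    else:
        print(f"{dataset_name} ({dataset_id}): {manifest_name} [{component or 'no component'}]")
```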
1236 def upload_project_manifests_to_synapse( 1237 self, dmge: DataModelGraphExplorer, projectId: str 1238 ) -> List[str]: 1239 """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. 1240 1241 Returns: List of the names of the datasets whose manifests were uploaded. 1242 """ 1243 1244 manifests = [] 1245 manifest_loaded = [] 1246 datasets = self.getStorageDatasetsInProject(projectId) 1247 1248 for datasetId, datasetName in datasets: 1249 # encode information about the manifest in a simple list (so that R clients can unpack it) 1250 # eventually can serialize differently 1251 1252 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1253 1254 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1255 if manifest_info: 1256 manifest_id = manifest_info["properties"]["id"] 1257 manifest_name = manifest_info["properties"]["name"] 1258 manifest_path = manifest_info["path"] 1259 manifest_df = load_df(manifest_path) 1260 manifest_table_id, manifest_df, table_manifest = self.uploadDB( 1261 dmge=dmge, 1262 manifest=manifest_df, 1263 datasetId=datasetId, 1264 table_name=datasetName, 1265 ) 1266 manifest_loaded.append(datasetName) 1267 return manifest_loaded
Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
Returns: List of the names of the datasets whose manifests were uploaded.
1269 def upload_annotated_project_manifests_to_synapse( 1270 self, projectId: str, path_to_json_ld: str, dry_run: bool = False 1271 ) -> List[str]: 1272 """ 1273 Purpose: 1274 For all manifests in a project, upload them as a table and add annotations manifest csv. 1275 Assumes the manifest is already present as a CSV in a dataset in the project. 1276 1277 """ 1278 # Instantiate DataModelParser 1279 data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld) 1280 # Parse Model 1281 parsed_data_model = data_model_parser.parse_model() 1282 1283 # Instantiate DataModelGraph 1284 data_model_grapher = DataModelGraph(parsed_data_model) 1285 1286 # Generate graph 1287 graph_data_model = data_model_grapher.generate_data_model_graph() 1288 1289 # Instantiate DataModelGraphExplorer 1290 dmge = DataModelGraphExplorer(graph_data_model) 1291 1292 manifests = [] 1293 manifest_loaded = [] 1294 datasets = self.getStorageDatasetsInProject(projectId) 1295 for datasetId, datasetName in datasets: 1296 # encode information about the manifest in a simple list (so that R clients can unpack it) 1297 # eventually can serialize differently 1298 1299 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1300 manifests.append(manifest) 1301 1302 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1303 1304 if manifest_info: 1305 manifest_id = manifest_info["properties"]["id"] 1306 manifest_name = manifest_info["properties"]["name"] 1307 manifest_path = manifest_info["path"] 1308 manifest = ( 1309 (datasetId, datasetName), 1310 (manifest_id, manifest_name), 1311 ("", ""), 1312 ) 1313 if not dry_run: 1314 self.associateMetadataWithFiles( 1315 dmge, manifest_path, datasetId, manifest_record_type="table" 1316 ) 1317 manifest_loaded.append(manifest) 1318 1319 return manifests, manifest_loaded
Purpose:
For all manifests in a project, upload each one as a table and add annotations from the manifest CSV. Assumes the manifest is already present as a CSV in a dataset in the project.
1321 def move_entities_to_new_project( 1322 self, 1323 projectId: str, 1324 newProjectId: str, 1325 returnEntities: bool = False, 1326 dry_run: bool = False, 1327 ): 1328 """ 1329 For each manifest csv in a project, look for all the entitiy ids that are associated. 1330 Look up the entitiy in the files, move the entity to new project. 1331 """ 1332 1333 manifests = [] 1334 manifest_loaded = [] 1335 datasets = self.getStorageDatasetsInProject(projectId) 1336 if datasets: 1337 for datasetId, datasetName in datasets: 1338 # encode information about the manifest in a simple list (so that R clients can unpack it) 1339 # eventually can serialize differently 1340 1341 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1342 manifests.append(manifest) 1343 1344 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1345 if manifest_info: 1346 manifest_id = manifest_info["properties"]["id"] 1347 manifest_name = manifest_info["properties"]["name"] 1348 manifest_path = manifest_info["path"] 1349 manifest_df = load_df(manifest_path) 1350 1351 manifest = ( 1352 (datasetId, datasetName), 1353 (manifest_id, manifest_name), 1354 ("", ""), 1355 ) 1356 manifest_loaded.append(manifest) 1357 1358 annotation_entities = self.storageFileviewTable[ 1359 (self.storageFileviewTable["id"].isin(manifest_df["entityId"])) 1360 & (self.storageFileviewTable["type"] == "folder") 1361 ]["id"] 1362 1363 if returnEntities: 1364 for entityId in annotation_entities: 1365 if not dry_run: 1366 moved_entity = self.syn.move(entityId, datasetId) 1367 self.synapse_entity_tracker.add( 1368 synapse_id=moved_entity.id, entity=moved_entity 1369 ) 1370 else: 1371 logging.info( 1372 f"{entityId} will be moved to folder {datasetId}." 1373 ) 1374 else: 1375 # generate project folder 1376 archive_project_folder = Folder( 1377 projectId + "_archive", parent=newProjectId 1378 ) 1379 archive_project_folder = self.syn.store(archive_project_folder) 1380 self.synapse_entity_tracker.add( 1381 synapse_id=archive_project_folder.id, 1382 entity=archive_project_folder, 1383 ) 1384 1385 # generate dataset folder 1386 dataset_archive_folder = Folder( 1387 "_".join([datasetId, datasetName, "archive"]), 1388 parent=archive_project_folder.id, 1389 ) 1390 dataset_archive_folder = self.syn.store(dataset_archive_folder) 1391 self.synapse_entity_tracker.add( 1392 synapse_id=dataset_archive_folder.id, 1393 entity=dataset_archive_folder, 1394 ) 1395 1396 for entityId in annotation_entities: 1397 # move entities to folder 1398 if not dry_run: 1399 moved_entity = self.syn.move( 1400 entityId, dataset_archive_folder.id 1401 ) 1402 self.synapse_entity_tracker.add( 1403 synapse_id=moved_entity.id, entity=moved_entity 1404 ) 1405 else: 1406 logging.info( 1407 f"{entityId} will be moved to folder {dataset_archive_folder.id}." 1408 ) 1409 else: 1410 raise LookupError( 1411 f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." 1412 ) 1413 return manifests, manifest_loaded
For each manifest CSV in a project, collect all associated entity IDs, look up each entity among the project's files, and move it to the new project.
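Because this rearranges entities, the `dry_run` flag is the safe way to preview the effect. A sketch, reusing the hypothetical `store` from the earlier example, with placeholder project IDs:

```python
# Nothing is moved when dry_run=True; intended moves are logged instead.
manifests, manifests_moved = store.move_entities_to_new_project(
    projectId="syn123",     # placeholder source project
    newProjectId="syn456",  # placeholder archive project
    returnEntities=False,   # archive into newProjectId rather than move back
    dry_run=True,
)
```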
1415 @tracer.start_as_current_span("SynapseStorage::get_synapse_table") 1416 def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: 1417 """Download a Synapse table as a pandas DataFrame; the returned query results also carry the table schema and etags. 1418 1419 Args: 1420 synapse_id: synapse ID of the table to query 1421 """ 1422 1423 results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) 1424 df = results.asDataFrame( 1425 rowIdAndVersionInIndex=False, 1426 na_values=STR_NA_VALUES_FILTERED, 1427 keep_default_na=False, 1428 ) 1429 1430 return df, results
Download a Synapse table as a pandas DataFrame; the returned query results also carry the table schema and etags.
Arguments:
- synapse_id: synapse ID of the table to query
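A short sketch of the two-part return value, with `store` as in the earlier examples and a placeholder table ID; the attributes read off the results object are those synapseclient's `CsvFileTable` is expected to expose:

```python
df, results = store.get_synapse_table("syn789")
print(df.head())
print(results.tableId, results.etag)  # schema/etag info travels with the query results
```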
537 def wrapper(*args, **kwargs): 538 try: 539 return method(*args, **kwargs) 540 except SynapseHTTPError as ex: 541 str_message = str(ex).replace("\n", "") 542 if "trash" in str_message or "does not exist" in str_message: 543 logging.warning(str_message) 544 return None 545 else: 546 raise ex
Method to upload a database to an asset store. In Synapse, this will upload a metadata table.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest: pd.DataFrame manifest to upload
- datasetId: synID of the dataset for the manifest
- table_name: name of the table to be uploaded
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
- existingTableId: str of the synId of the existing table, if one already exists
- table_manipulation: str, 'replace' or 'upsert'; when a manifest already exists, determines whether the new metadata replaces the existing table ('replace') or is added to it ('upsert')
- table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names: display_name uses the raw display name as the column name; class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting.
Returns:
manifest_table_id: synID of the uploaded table
manifest: the original manifest
table_manifest: manifest formatted appropriately for the table
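Since the method returns three values, callers typically unpack all of them. A hedged sketch, with `store`, a `DataModelGraphExplorer` instance `dmge`, and a manifest DataFrame `manifest_df` assumed to already exist; IDs and the table name are placeholders:

```python
manifest_table_id, manifest_df, table_manifest = store.uploadDB(
    dmge=dmge,
    manifest=manifest_df,
    datasetId="syn123",                 # placeholder dataset ID
    table_name="ExamplePatient_table",  # hypothetical table name
    restrict=False,
    table_manipulation="replace",
    table_column_names="class_label",
)
```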
1481 @tracer.start_as_current_span("SynapseStorage::formatDB") 1482 def formatDB(self, dmge, manifest, table_column_names): 1483 """ 1484 Method to format a manifest appropriately for upload as a table 1485 1486 Args: 1487 dmge: DataModelGraphExplorer object 1488 manifest: pd.DataFrame manifest to upload 1489 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 1490 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 1491 display label formatting. 1492 Returns: 1493 col_schema: schema for table columns: type, size, etc 1494 table_manifest: formatted manifest 1495 1496 """ 1497 # Rename the manifest columns to display names to match fileview 1498 1499 blacklist_chars = ["(", ")", ".", " ", "-"] 1500 manifest_columns = manifest.columns.tolist() 1501 1502 table_manifest = deepcopy(manifest) 1503 1504 if table_column_names == "display_name": 1505 cols = table_manifest.columns 1506 1507 elif table_column_names == "display_label": 1508 cols = [ 1509 str(col).translate({ord(x): "" for x in blacklist_chars}) 1510 for col in manifest_columns 1511 ] 1512 1513 elif table_column_names == "class_label": 1514 cols = [ 1515 get_class_label_from_display_name(str(col)).translate( 1516 {ord(x): "" for x in blacklist_chars} 1517 ) 1518 for col in manifest_columns 1519 ] 1520 else: 1521 raise ValueError( 1522 f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only." 1523 ) 1524 1525 cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols)) 1526 1527 # Reset column names in table manifest 1528 table_manifest.columns = cols 1529 1530 # move entity id to end of df 1531 entity_col = table_manifest.pop("entityId") 1532 table_manifest.insert(len(table_manifest.columns), "entityId", entity_col) 1533 1534 # Get the column schema 1535 col_schema = as_table_columns(table_manifest) 1536 1537 # Set Id column length to 64 (for some reason not being auto set.) 1538 for i, col in enumerate(col_schema): 1539 if col["name"].lower() == "id": 1540 col_schema[i]["maximumSize"] = 64 1541 1542 return col_schema, table_manifest
Method to format a manifest appropriately for upload as a table
Arguments:
- dmge: DataModelGraphExplorer object
- manifest: pd.DataFrame manifest to upload
- table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names: display_name uses the raw display name as the column name; class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting.
Returns:
col_schema: schema for table columns (type, size, etc.)
table_manifest: formatted manifest
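The column relabeling reduces to `str.translate` with a deletion table built from the blacklisted characters. A self-contained illustration of the display_label branch:

```python
blacklist_chars = ["(", ")", ".", " ", "-"]
col = "Family History (Cancer)"
# Each blacklisted character is mapped to the empty string in the translation table
print(col.translate({ord(x): "" for x in blacklist_chars}))  # -> FamilyHistoryCancer
```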
1544 @tracer.start_as_current_span("SynapseStorage::buildDB") 1545 def buildDB( 1546 self, 1547 datasetId: str, 1548 table_name: str, 1549 col_schema: List, 1550 table_manifest: pd.DataFrame, 1551 table_manipulation: str, 1552 dmge: DataModelGraphExplorer, 1553 restrict: bool = False, 1554 ): 1555 """ 1556 Method to construct the table appropriately: create new table, replace existing, or upsert new into existing 1557 Calls TableOperations class to execute 1558 1559 Args: 1560 datasetId: synID of the dataset for the manifest 1561 table_name: name of the table to be uploaded 1562 col_schema: schema for table columns: type, size, etc from `formatDB` 1563 table_manifest: formatted manifest that can be uploaded as a table 1564 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1565 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1566 1567 Returns: 1568 manifest_table_id: synID of the uploaded table 1569 1570 """ 1571 table_parent_id = self.getDatasetProject(datasetId=datasetId) 1572 existing_table_id = self.syn.findEntityId( 1573 name=table_name, parent=table_parent_id 1574 ) 1575 1576 tableOps = TableOperations( 1577 synStore=self, 1578 tableToLoad=table_manifest, 1579 tableName=table_name, 1580 datasetId=datasetId, 1581 existingTableId=existing_table_id, 1582 restrict=restrict, 1583 synapse_entity_tracker=self.synapse_entity_tracker, 1584 ) 1585 1586 if not table_manipulation or existing_table_id is None: 1587 manifest_table_id = tableOps.createTable( 1588 columnTypeDict=col_schema, 1589 specifySchema=True, 1590 ) 1591 elif existing_table_id is not None: 1592 if table_manipulation.lower() == "replace": 1593 manifest_table_id = tableOps.replaceTable( 1594 specifySchema=True, 1595 columnTypeDict=col_schema, 1596 ) 1597 elif table_manipulation.lower() == "upsert": 1598 manifest_table_id = tableOps.upsertTable( 1599 dmge=dmge, 1600 ) 1601 elif table_manipulation.lower() == "update": 1602 manifest_table_id = tableOps.updateTable() 1603 1604 if table_manipulation and table_manipulation.lower() == "upsert": 1605 table_entity = self.synapse_entity_tracker.get( 1606 synapse_id=existing_table_id or manifest_table_id, 1607 syn=self.syn, 1608 download_file=False, 1609 ) 1610 annos = OldAnnotations( 1611 id=table_entity.id, 1612 etag=table_entity.etag, 1613 values=table_entity.annotations, 1614 ) 1615 annos["primary_key"] = table_manifest["Component"][0] + "_id" 1616 annos = self.syn.set_annotations(annos) 1617 table_entity.etag = annos.etag 1618 table_entity.annotations = annos 1619 1620 return manifest_table_id
Method to construct the table appropriately: create a new table, replace an existing one, or upsert new rows into an existing one. Calls the TableOperations class to execute the operation.
Arguments:
- datasetId: synID of the dataset for the manifest
- table_name: name of the table to be uploaded
- col_schema: schema for table columns (type, size, etc.) from `formatDB`
- table_manifest: formatted manifest that can be uploaded as a table
- table_manipulation: str, 'replace' or 'upsert'; when a manifest already exists, determines whether the new metadata replaces the existing table ('replace') or is added to it ('upsert')
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
Returns:
manifest_table_id: synID of the uploaded table
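A sketch of a call routing to the upsert path, assuming `col_schema` and `table_manifest` came from `formatDB` and the other names carry over from the earlier sketches:

```python
manifest_table_id = store.buildDB(
    datasetId="syn123",                 # placeholder dataset ID
    table_name="ExamplePatient_table",  # hypothetical; reused from the uploadDB sketch
    col_schema=col_schema,
    table_manifest=table_manifest,
    table_manipulation="upsert",        # dispatches to TableOperations.upsertTable
    dmge=dmge,
    restrict=False,
)
```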
1622 @tracer.start_as_current_span("SynapseStorage::upload_manifest_file") 1623 def upload_manifest_file( 1624 self, 1625 manifest, 1626 metadataManifestPath, 1627 datasetId, 1628 restrict_manifest, 1629 component_name="", 1630 ): 1631 # Update manifest to have the new entityId column 1632 manifest.to_csv(metadataManifestPath, index=False) 1633 1634 # store manifest to Synapse as a CSV 1635 # update file name 1636 file_name_full = metadataManifestPath.split("/")[-1] 1637 file_extension = file_name_full.split(".")[-1] 1638 1639 # Differentiate "censored" and "uncensored" manifest 1640 if "censored" in file_name_full: 1641 file_name_new = ( 1642 os.path.basename(CONFIG.synapse_manifest_basename) 1643 + "_" 1644 + component_name 1645 + "_censored" 1646 + "." 1647 + file_extension 1648 ) 1649 else: 1650 file_name_new = ( 1651 os.path.basename(CONFIG.synapse_manifest_basename) 1652 + "_" 1653 + component_name 1654 + "." 1655 + file_extension 1656 ) 1657 1658 manifest_synapse_file = None 1659 try: 1660 # Rename the file to file_name_new then revert 1661 # This is to maintain the original file name in-case other code is 1662 # expecting that the file exists with the original name 1663 original_file_path = metadataManifestPath 1664 new_file_path = os.path.join( 1665 os.path.dirname(metadataManifestPath), file_name_new 1666 ) 1667 os.rename(original_file_path, new_file_path) 1668 1669 manifest_synapse_file = self._store_file_for_manifest_upload( 1670 new_file_path=new_file_path, 1671 dataset_id=datasetId, 1672 existing_file_name=file_name_full, 1673 file_name_new=file_name_new, 1674 restrict_manifest=restrict_manifest, 1675 ) 1676 manifest_synapse_file_id = manifest_synapse_file.id 1677 1678 finally: 1679 # Revert the file name back to the original 1680 os.rename(new_file_path, original_file_path) 1681 1682 if manifest_synapse_file: 1683 manifest_synapse_file.path = original_file_path 1684 1685 return manifest_synapse_file_id
1742 async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]: 1743 """get annotations asynchronously 1744 1745 Args: 1746 synapse_id (str): synapse id of the entity that the annotation belongs 1747 1748 Returns: 1749 Dict[str, Any]: The requested entity bundle matching 1750 <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html> 1751 """ 1752 return await get_entity_id_bundle2( 1753 entity_id=synapse_id, 1754 request={"includeAnnotations": True}, 1755 synapse_client=self.syn, 1756 )
get annotations asynchronously
Arguments:
- synapse_id (str): Synapse ID of the entity that the annotations belong to
Returns:
Dict[str, Any]: The requested entity bundle matching https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html
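Being a coroutine, it composes well with `asyncio.gather` when several entities' annotation bundles are needed at once. A sketch with placeholder IDs and `store` as in the earlier examples:

```python
import asyncio

async def fetch_bundles(store, synapse_ids):
    # Fire all bundle requests concurrently rather than one at a time
    return await asyncio.gather(
        *(store.get_async_annotation(synapse_id) for synapse_id in synapse_ids)
    )

bundles = asyncio.run(fetch_bundles(store, ["syn111", "syn222"]))
```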
1758 async def store_async_annotation(self, annotation_dict: dict) -> Annotations: 1759 """store annotation in an async way 1760 1761 Args: 1762 annotation_dict (dict): annotation in a dictionary format 1763 1764 Returns: 1765 Annotations: The stored annotations. 1766 """ 1767 annotation_data = Annotations.from_dict( 1768 synapse_annotations=annotation_dict["annotations"]["annotations"] 1769 ) 1770 annotation_class = Annotations( 1771 annotations=annotation_data, 1772 etag=annotation_dict["annotations"]["etag"], 1773 id=annotation_dict["annotations"]["id"], 1774 ) 1775 annotation_storage_result = await annotation_class.store_async( 1776 synapse_client=self.syn 1777 ) 1778 local_entity = self.synapse_entity_tracker.get( 1779 synapse_id=annotation_dict["annotations"]["id"], 1780 syn=self.syn, 1781 download_file=False, 1782 retrieve_if_not_present=False, 1783 ) 1784 if local_entity: 1785 local_entity.etag = annotation_storage_result.etag 1786 local_entity.annotations = annotation_storage_result 1787 return annotation_storage_result
Store annotations asynchronously.
Arguments:
- annotation_dict (dict): annotation in a dictionary format
Returns:
Annotations: The stored annotations.
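The bundle returned by `get_async_annotation` has the `annotations`/`etag`/`id` shape this method expects, so a fetch-then-store round trip looks roughly like this (placeholder ID; `store` as above):

```python
import asyncio

async def refresh_annotations(store, synapse_id):
    bundle = await store.get_async_annotation(synapse_id)
    # Re-store as-is; real callers would modify the bundle's annotations first
    return await store.store_async_annotation(annotation_dict=bundle)

stored = asyncio.run(refresh_annotations(store, "syn111"))
```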
1789 def process_row_annotations( 1790 self, 1791 dmge: DataModelGraphExplorer, 1792 metadata_syn: Dict[str, Any], 1793 hide_blanks: bool, 1794 csv_list_regex: str, 1795 annos: Dict[str, Any], 1796 annotation_keys: str, 1797 ) -> Dict[str, Any]: 1798 """Processes metadata annotations based on the logic below: 1799 1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is: 1800 An empty or whitespace-only string. 1801 A NaN value (if the annotation is a float). 1802 if any of the above conditions are met, and hide_blanks is True, the annotation key is not going to be uploaded and skips further processing of that annotation key. 1803 if any of the above conditions are met, and hide_blanks is False, assigns an empty string "" as the annotation value for that key. 1804 1805 2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name". 1806 Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key. 1807 1808 3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k). 1809 1810 4. Returns the updated annotations dictionary. 1811 1812 Args: 1813 dmge (DataModelGraphExplorer): data model graph explorer 1814 metadata_syn (dict): metadata used for Synapse storage 1815 hideBlanks (bool): if true, does not upload annotation keys with blank values. 1816 csv_list_regex (str): Regex to match with comma separated list 1817 annos (Dict[str, Any]): dictionary of annotation returned from synapse 1818 annotation_keys (str): display_label/class_label 1819 1820 Returns: 1821 Dict[str, Any]: annotations as a dictionary 1822 1823 ```mermaid 1824 flowchart TD 1825 A[Start] --> C{Is anno_v empty, whitespace, or NaN?} 1826 C -- Yes --> D{Is hide_blanks True?} 1827 D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. 
Skip further processing] 1828 D -- No --> F[Assign empty string to annotation key] 1829 C -- No --> G{Is anno_v a string?} 1830 G -- No --> H[Assign original value of anno_v to annotation key] 1831 G -- Yes --> I{Does anno_v match csv_list_regex?} 1832 I -- Yes --> J[Get validation rule of anno_k] 1833 J --> K{Does the validation rule contain 'list'?} 1834 K -- Yes --> L[Split anno_v by commas and assign as list] 1835 I -- No --> H 1836 K -- No --> H 1837 ``` 1838 """ 1839 for anno_k, anno_v in metadata_syn.items(): 1840 # Remove keys with nan or empty string values, or strings containing only whitespace, from the dict of annotations to be uploaded 1841 # if present on current data annotation 1842 if hide_blanks and ( 1843 (isinstance(anno_v, str) and anno_v.strip() == "") 1844 or (isinstance(anno_v, float) and np.isnan(anno_v)) 1845 ): 1846 if anno_k in annos["annotations"]["annotations"]: 1847 annos["annotations"]["annotations"].pop(anno_k) 1848 1849 continue 1850 1851 # Otherwise save annotation as appropriate 1852 if isinstance(anno_v, float) and np.isnan(anno_v): 1853 annos["annotations"]["annotations"][anno_k] = "" 1854 continue 1855 1856 # Handle strings that match the csv_list_regex and pass the validation rule 1857 if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v): 1858 # Use a dictionary to dynamically choose the argument 1859 param = ( 1860 {"node_display_name": anno_k} 1861 if annotation_keys == "display_label" 1862 else {"node_label": anno_k} 1863 ) 1864 node_validation_rules = dmge.get_node_validation_rules(**param) 1865 1866 if rule_in_rule_list("list", node_validation_rules): 1867 annos["annotations"]["annotations"][anno_k] = anno_v.split(",") 1868 continue 1869 # default: assign the original value 1870 annos["annotations"]["annotations"][anno_k] = anno_v 1871 1872 return annos
Processes metadata annotations based on the logic below:
1. Checks whether hide_blanks is True and whether the current annotation value (anno_v) is an empty or whitespace-only string, or a NaN value (if the annotation is a float). If either condition is met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped; if hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
2. If the value is a string and matches the pattern defined by csv_list_regex, the validation rule is retrieved based on the "node label" or "node display name". If the rule contains "list", the string is split on commas and the resulting list is assigned as the annotation value for that key.
3. For any other condition, the original value of anno_v is assigned to the annotation key (anno_k).
4. Returns the updated annotations dictionary.
Arguments:
- dmge (DataModelGraphExplorer): data model graph explorer
- metadata_syn (dict): metadata used for Synapse storage
- hide_blanks (bool): if True, does not upload annotation keys with blank values.
- csv_list_regex (str): Regex to match with comma separated list
- annos (Dict[str, Any]): dictionary of annotations returned from Synapse
- annotation_keys (str): display_label/class_label
Returns:
Dict[str, Any]: annotations as a dictionary
flowchart TD
    A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
    C -- Yes --> D{Is hide_blanks True?}
    D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
    D -- No --> F[Assign empty string to annotation key]
    C -- No --> G{Is anno_v a string?}
    G -- No --> H[Assign original value of anno_v to annotation key]
    G -- Yes --> I{Does anno_v match csv_list_regex?}
    I -- Yes --> J[Get validation rule of anno_k]
    J --> K{Does the validation rule contain 'list'?}
    K -- Yes --> L[Split anno_v by commas and assign as list]
    I -- No --> H
    K -- No --> H
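The list branch hinges on `re.fullmatch` against the comma-separated-list pattern. A self-contained illustration, assuming `comma_separated_list_regex()` (imported at the top of this module) returns the pattern string used here:

```python
import re

from schematic.utils.validate_utils import comma_separated_list_regex

csv_list_regex = comma_separated_list_regex()
anno_v = "lung,liver,kidney"
if re.fullmatch(csv_list_regex, anno_v):
    # A value whose validation rules include "list" is split before upload
    print(anno_v.split(","))  # -> ['lung', 'liver', 'kidney']
```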
553 async def wrapper(*args: Any, **kwargs: Any) -> Any: 554 try: 555 return await method(*args, **kwargs) 556 except SynapseHTTPError as ex: 557 str_message = str(ex).replace("\n", "") 558 if "trash" in str_message or "does not exist" in str_message: 559 logging.warning(str_message) 560 return None 561 else: 562 raise ex
Set annotations for the manifest (as a whole) so they can be applied to the manifest table or CSV. For now this just sets the Component.
2234 @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files") 2235 async def add_annotations_to_entities_files( 2236 self, 2237 dmge, 2238 manifest, 2239 manifest_record_type: str, 2240 datasetId: str, 2241 hideBlanks: bool, 2242 manifest_synapse_table_id="", 2243 annotation_keys: str = "class_label", 2244 ): 2245 """ 2246 Depending on upload type add Ids to entityId row. Add anotations to connected 2247 files and folders. Despite the name of this function, it also applies to folders. 2248 2249 Args: 2250 dmge: DataModelGraphExplorer Object 2251 manifest (pd.DataFrame): loaded df containing user supplied data. 2252 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2253 datasetId (str): synapse ID of folder containing the dataset 2254 hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2255 manifest_synapse_table_id (str): Default is an empty string ''. 2256 annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display 2257 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2258 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2259 Returns: 2260 manifest (pd.DataFrame): modified to add entitiyId as appropriate 2261 2262 """ 2263 2264 # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting 2265 if "filename" in [col.lower() for col in manifest.columns]: 2266 # get current list of files and store as dataframe 2267 dataset_files = self.getFilesInStorageDataset(datasetId) 2268 files_and_entityIds = self._get_file_entityIds( 2269 dataset_files=dataset_files, only_new_files=False 2270 ) 2271 file_df = pd.DataFrame(files_and_entityIds) 2272 2273 # Merge dataframes to add entityIds 2274 manifest = manifest.merge( 2275 file_df, how="left", on="Filename", suffixes=["_x", None] 2276 ).drop("entityId_x", axis=1) 2277 2278 # Fill `entityId` for each row if missing and annotate entity as appropriate 2279 requests = set() 2280 for idx, row in manifest.iterrows(): 2281 if not row["entityId"] and ( 2282 manifest_record_type == "file_and_entities" 2283 or manifest_record_type == "table_file_and_entities" 2284 ): 2285 manifest, entityId = self._create_entity_id( 2286 idx, row, manifest, datasetId 2287 ) 2288 elif not row["entityId"] and manifest_record_type == "table_and_file": 2289 # If not using entityIds, fill with manifest_table_id so 2290 row["entityId"] = manifest_synapse_table_id 2291 manifest.loc[idx, "entityId"] = manifest_synapse_table_id 2292 entityId = "" 2293 # If the row is the manifest table, do not add annotations 2294 elif row["entityId"] == manifest_synapse_table_id: 2295 entityId = "" 2296 else: 2297 # get the file id of the file to annotate, collected in above step. 2298 entityId = row["entityId"] 2299 2300 # Adding annotations to connected files. 
2301 if entityId: 2302 # Format annotations for Synapse 2303 annos_task = asyncio.create_task( 2304 self.format_row_annotations( 2305 dmge, row, entityId, hideBlanks, annotation_keys 2306 ) 2307 ) 2308 requests.add(annos_task) 2309 await self._process_store_annos(requests) 2310 return manifest
Depending on the upload type, add IDs to the entityId column, and add annotations to the connected files and folders. Despite the name of this function, it also applies to folders.
Arguments:
- dmge: DataModelGraphExplorer Object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity IDs and folders for each row in the manifest, a Synapse table to house the entire manifest, or both.
- datasetId (str): synapse ID of folder containing the dataset
- hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty-string values.
- manifest_synapse_table_id (str): Default is an empty string ''.
- annotation_keys: (str) display_label/class_label (default). Determines the labeling style for annotation keys: class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting and ensuring the label is valid for Synapse annotations.
Returns:
manifest (pd.DataFrame): modified to add entityId as appropriate
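Because this is a coroutine, the synchronous upload paths wrap it in `asyncio.run`; a sketch mirroring that call pattern, with names from the earlier sketches and placeholder IDs:

```python
import asyncio

manifest = asyncio.run(
    store.add_annotations_to_entities_files(
        dmge,
        manifest,
        manifest_record_type="table_and_file",
        datasetId="syn123",
        hideBlanks=True,
        manifest_synapse_table_id="syn999",  # placeholder table ID
    )
)
```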
2312 @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table") 2313 def upload_manifest_as_table( 2314 self, 2315 dmge: DataModelGraphExplorer, 2316 manifest: pd.DataFrame, 2317 metadataManifestPath: str, 2318 datasetId: str, 2319 table_name: str, 2320 component_name: str, 2321 restrict: bool, 2322 manifest_record_type: str, 2323 hideBlanks: bool, 2324 table_manipulation: str, 2325 table_column_names: str, 2326 annotation_keys: str, 2327 file_annotations_upload: bool = True, 2328 ): 2329 """Upload manifest to Synapse as a table and csv. 2330 Args: 2331 dmge: DataModelGraphExplorer object 2332 manifest (pd.DataFrame): loaded df containing user supplied data. 2333 metadataManifestPath: path to csv containing a validated metadata manifest. 2334 datasetId (str): synapse ID of folder containing the dataset 2335 table_name (str): Generated to name the table being uploaded. 2336 component_name (str): Name of the component manifest that is currently being uploaded. 2337 restrict (bool): Flag for censored data. 2338 manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2339 hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2340 table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2341 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2342 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2343 display label formatting. 2344 annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display 2345 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2346 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2347 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2348 Return: 2349 manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. 2350 """ 2351 # Upload manifest as a table, get the ID and updated manifest. 2352 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2353 dmge=dmge, 2354 manifest=manifest, 2355 datasetId=datasetId, 2356 table_name=table_name, 2357 restrict=restrict, 2358 table_manipulation=table_manipulation, 2359 table_column_names=table_column_names, 2360 ) 2361 2362 if file_annotations_upload: 2363 manifest = asyncio.run( 2364 self.add_annotations_to_entities_files( 2365 dmge, 2366 manifest, 2367 manifest_record_type, 2368 datasetId, 2369 hideBlanks, 2370 manifest_synapse_table_id, 2371 annotation_keys, 2372 ) 2373 ) 2374 # Load manifest to synapse as a CSV File 2375 manifest_synapse_file_id = self.upload_manifest_file( 2376 manifest=manifest, 2377 metadataManifestPath=metadataManifestPath, 2378 datasetId=datasetId, 2379 restrict_manifest=restrict, 2380 component_name=component_name, 2381 ) 2382 2383 # Set annotations for the file manifest. 
2384 manifest_annotations = self.format_manifest_annotations( 2385 manifest=manifest, manifest_synapse_id=manifest_synapse_file_id 2386 ) 2387 annos = self.syn.set_annotations(annotations=manifest_annotations) 2388 manifest_entity = self.synapse_entity_tracker.get( 2389 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2390 ) 2391 manifest_entity.annotations = annos 2392 manifest_entity.etag = annos.etag 2393 2394 logger.info("Associated manifest file with dataset on Synapse.") 2395 2396 # Update manifest Synapse table with new entity id column. 2397 manifest_synapse_table_id, manifest, _ = self.uploadDB( 2398 dmge=dmge, 2399 manifest=manifest, 2400 datasetId=datasetId, 2401 table_name=table_name, 2402 restrict=restrict, 2403 table_manipulation="update", 2404 table_column_names=table_column_names, 2405 ) 2406 2407 # Set annotations for the table manifest 2408 manifest_annotations = self.format_manifest_annotations( 2409 manifest=manifest, manifest_synapse_id=manifest_synapse_table_id 2410 ) 2411 annotations_manifest_table = self.syn.set_annotations( 2412 annotations=manifest_annotations 2413 ) 2414 manifest_table_entity = self.synapse_entity_tracker.get( 2415 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2416 ) 2417 manifest_table_entity.annotations = annotations_manifest_table 2418 manifest_table_entity.etag = annotations_manifest_table.etag 2419 2420 return manifest_synapse_file_id
Upload manifest to Synapse as a table and csv.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- table_name (str): Generated to name the table being uploaded.
- component_name (str): Name of the component manifest that is currently being uploaded.
- restrict (bool): Flag for censored data.
- manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity IDs and folders for each row in the manifest, a Synapse table to house the entire manifest, or both.
- hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty-string values.
- table_manipulation (str): Specifies how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names: display_name uses the raw display name as the column name; class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting.
- annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys: class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting and ensuring the label is valid for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id: SynID of the manifest CSV uploaded to Synapse.
2422 @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv") 2423 def upload_manifest_as_csv( 2424 self, 2425 dmge, 2426 manifest, 2427 metadataManifestPath, 2428 datasetId, 2429 restrict, 2430 manifest_record_type, 2431 hideBlanks, 2432 component_name, 2433 annotation_keys: str, 2434 file_annotations_upload: bool = True, 2435 ): 2436 """Upload manifest to Synapse as a csv only. 2437 Args: 2438 dmge: DataModelGraphExplorer object 2439 manifest (pd.DataFrame): loaded df containing user supplied data. 2440 metadataManifestPath: path to csv containing a validated metadata manifest. 2441 datasetId (str): synapse ID of folder containing the dataset 2442 restrict (bool): Flag for censored data. 2443 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2444 hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2445 annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display 2446 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2447 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2448 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2449 Return: 2450 manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 2451 """ 2452 if file_annotations_upload: 2453 manifest = asyncio.run( 2454 self.add_annotations_to_entities_files( 2455 dmge, 2456 manifest, 2457 manifest_record_type, 2458 datasetId, 2459 hideBlanks, 2460 annotation_keys=annotation_keys, 2461 ) 2462 ) 2463 2464 # Load manifest to synapse as a CSV File 2465 manifest_synapse_file_id = self.upload_manifest_file( 2466 manifest, 2467 metadataManifestPath, 2468 datasetId, 2469 restrict, 2470 component_name=component_name, 2471 ) 2472 2473 # Set annotations for the file manifest. 2474 manifest_annotations = self.format_manifest_annotations( 2475 manifest, manifest_synapse_file_id 2476 ) 2477 annos = self.syn.set_annotations(manifest_annotations) 2478 manifest_entity = self.synapse_entity_tracker.get( 2479 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2480 ) 2481 manifest_entity.annotations = annos 2482 manifest_entity.etag = annos.etag 2483 2484 logger.info("Associated manifest file with dataset on Synapse.") 2485 2486 return manifest_synapse_file_id
Upload manifest to Synapse as a csv only.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- restrict (bool): Flag for censored data.
- manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity IDs and folders for each row in the manifest, a Synapse table to house the entire manifest, or both.
- hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty-string values.
- annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys: class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting and ensuring the label is valid for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id (str): SynID of the manifest CSV uploaded to Synapse.
2488 @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo") 2489 def upload_manifest_combo( 2490 self, 2491 dmge, 2492 manifest, 2493 metadataManifestPath, 2494 datasetId, 2495 table_name, 2496 component_name, 2497 restrict, 2498 manifest_record_type, 2499 hideBlanks, 2500 table_manipulation, 2501 table_column_names: str, 2502 annotation_keys: str, 2503 file_annotations_upload: bool = True, 2504 ): 2505 """Upload manifest to Synapse as a table and CSV with entities. 2506 Args: 2507 dmge: DataModelGraphExplorer object 2508 manifest (pd.DataFrame): loaded df containing user supplied data. 2509 metadataManifestPath: path to csv containing a validated metadata manifest. 2510 datasetId (str): synapse ID of folder containing the dataset 2511 table_name (str): Generated to name the table being uploaded. 2512 component_name (str): Name of the component manifest that is currently being uploaded. 2513 restrict (bool): Flag for censored data. 2514 manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both. 2515 hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2516 table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2517 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2518 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2519 display label formatting. 2520 annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display 2521 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2522 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2523 file_annotations_upload (bool): Default to True. If false, do not add annotations to files. 2524 Return: 2525 manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse. 2526 """ 2527 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2528 dmge=dmge, 2529 manifest=manifest, 2530 datasetId=datasetId, 2531 table_name=table_name, 2532 restrict=restrict, 2533 table_manipulation=table_manipulation, 2534 table_column_names=table_column_names, 2535 ) 2536 2537 if file_annotations_upload: 2538 manifest = asyncio.run( 2539 self.add_annotations_to_entities_files( 2540 dmge, 2541 manifest, 2542 manifest_record_type, 2543 datasetId, 2544 hideBlanks, 2545 manifest_synapse_table_id, 2546 annotation_keys=annotation_keys, 2547 ) 2548 ) 2549 2550 # Load manifest to synapse as a CSV File 2551 manifest_synapse_file_id = self.upload_manifest_file( 2552 manifest, metadataManifestPath, datasetId, restrict, component_name 2553 ) 2554 2555 # Set annotations for the file manifest. 
2556 manifest_annotations = self.format_manifest_annotations( 2557 manifest, manifest_synapse_file_id 2558 ) 2559 file_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2560 manifest_entity = self.synapse_entity_tracker.get( 2561 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2562 ) 2563 manifest_entity.annotations = file_manifest_annotations 2564 manifest_entity.etag = file_manifest_annotations.etag 2565 logger.info("Associated manifest file with dataset on Synapse.") 2566 2567 # Update manifest Synapse table with new entity id column. 2568 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2569 dmge=dmge, 2570 manifest=manifest, 2571 datasetId=datasetId, 2572 table_name=table_name, 2573 restrict=restrict, 2574 table_manipulation="update", 2575 table_column_names=table_column_names, 2576 ) 2577 2578 # Set annotations for the table manifest 2579 manifest_annotations = self.format_manifest_annotations( 2580 manifest, manifest_synapse_table_id 2581 ) 2582 table_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2583 manifest_entity = self.synapse_entity_tracker.get( 2584 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2585 ) 2586 manifest_entity.annotations = table_manifest_annotations 2587 manifest_entity.etag = table_manifest_annotations.etag 2588 return manifest_synapse_file_id
Upload manifest to Synapse as a table and CSV with entities.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- table_name (str): Generated to name the table being uploaded.
- component_name (str): Name of the component manifest that is currently being uploaded.
- restrict (bool): Flag for censored data.
- manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity IDs and folders for each row in the manifest, a Synapse table to house the entire manifest, or both.
- hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty-string values.
- table_manipulation (str): Specifies how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names: display_name uses the raw display name as the column name; class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting.
- annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys: class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, while retaining display-label formatting and ensuring the label is valid for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id (str): SynID of the manifest CSV uploaded to Synapse.
2590 @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") 2591 def associateMetadataWithFiles( 2592 self, 2593 dmge: DataModelGraphExplorer, 2594 metadataManifestPath: str, 2595 datasetId: str, 2596 manifest_record_type: str = "table_file_and_entities", 2597 hideBlanks: bool = False, 2598 restrict_manifest=False, 2599 table_manipulation: str = "replace", 2600 table_column_names: str = "class_label", 2601 annotation_keys: str = "class_label", 2602 file_annotations_upload: bool = True, 2603 ) -> str: 2604 """Associate metadata with files in a storage dataset already on Synapse. 2605 Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. 2606 2607 If this is a new manifest there could be no Synapse entities associated with the rows of this manifest 2608 this may be due to data type (e.g. clinical data) being tabular 2609 and not requiring files; to utilize uniform interfaces downstream 2610 (i.e. fileviews), a Synapse entity (a folder) is created for each row 2611 and an entity column is added to the manifest containing the resulting 2612 entity IDs; a table is also created at present as an additional interface 2613 for downstream query and interaction with the data. 2614 2615 Args: 2616 dmge: DataModelGraphExplorer Object 2617 metadataManifestPath: path to csv containing a validated metadata manifest. 2618 The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. 2619 Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item. 2620 In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to his file. 2621 datasetId: synapse ID of folder containing the dataset 2622 manifest_record_type: Default value is 'table_file_and_entities'. valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest.'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_with_entites and table in combination. 2623 hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false. 2624 restrict_manifest (bool): Default is false. Flag for censored data. 2625 table_malnipulation (str): Default is 'replace'. Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'. 2626 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 2627 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2628 display label formatting. 2629 annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. 
class_label will format the display 2630 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 2631 display label formatting while ensuring the label is formatted properly for Synapse annotations. 2632 Returns: 2633 manifest_synapse_file_id: SynID of manifest csv uploaded to synapse. 2634 """ 2635 # Read new manifest CSV: 2636 manifest = self._read_manifest(metadataManifestPath) 2637 manifest = self._add_id_columns_to_manifest(manifest, dmge) 2638 2639 table_name, component_name = self._generate_table_name(manifest) 2640 2641 # Upload manifest to synapse based on user input (manifest_record_type) 2642 if manifest_record_type == "file_only": 2643 manifest_synapse_file_id = self.upload_manifest_as_csv( 2644 dmge=dmge, 2645 manifest=manifest, 2646 metadataManifestPath=metadataManifestPath, 2647 datasetId=datasetId, 2648 restrict=restrict_manifest, 2649 hideBlanks=hideBlanks, 2650 manifest_record_type=manifest_record_type, 2651 component_name=component_name, 2652 annotation_keys=annotation_keys, 2653 file_annotations_upload=file_annotations_upload, 2654 ) 2655 elif manifest_record_type == "table_and_file": 2656 manifest_synapse_file_id = self.upload_manifest_as_table( 2657 dmge=dmge, 2658 manifest=manifest, 2659 metadataManifestPath=metadataManifestPath, 2660 datasetId=datasetId, 2661 table_name=table_name, 2662 component_name=component_name, 2663 restrict=restrict_manifest, 2664 hideBlanks=hideBlanks, 2665 manifest_record_type=manifest_record_type, 2666 table_manipulation=table_manipulation, 2667 table_column_names=table_column_names, 2668 annotation_keys=annotation_keys, 2669 file_annotations_upload=file_annotations_upload, 2670 ) 2671 elif manifest_record_type == "file_and_entities": 2672 manifest_synapse_file_id = self.upload_manifest_as_csv( 2673 dmge=dmge, 2674 manifest=manifest, 2675 metadataManifestPath=metadataManifestPath, 2676 datasetId=datasetId, 2677 restrict=restrict_manifest, 2678 hideBlanks=hideBlanks, 2679 manifest_record_type=manifest_record_type, 2680 component_name=component_name, 2681 annotation_keys=annotation_keys, 2682 file_annotations_upload=file_annotations_upload, 2683 ) 2684 elif manifest_record_type == "table_file_and_entities": 2685 manifest_synapse_file_id = self.upload_manifest_combo( 2686 dmge=dmge, 2687 manifest=manifest, 2688 metadataManifestPath=metadataManifestPath, 2689 datasetId=datasetId, 2690 table_name=table_name, 2691 component_name=component_name, 2692 restrict=restrict_manifest, 2693 hideBlanks=hideBlanks, 2694 manifest_record_type=manifest_record_type, 2695 table_manipulation=table_manipulation, 2696 table_column_names=table_column_names, 2697 annotation_keys=annotation_keys, 2698 file_annotations_upload=file_annotations_upload, 2699 ) 2700 else: 2701 raise ValueError("Please enter a valid manifest_record_type.") 2702 return manifest_synapse_file_id
Associate metadata with files in a storage dataset already on Synapse. Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
If this is a new manifest, there may be no Synapse entities associated with its rows; this can happen when the data type (e.g. clinical data) is tabular and does not require files. To provide uniform interfaces downstream (i.e. fileviews), a Synapse entity (a folder) is created for each row, and an entity column containing the resulting entity IDs is added to the manifest; at present, a table is also created as an additional interface for downstream querying and interaction with the data.
Arguments:
- dmge: DataModelGraphExplorer Object
- metadataManifestPath: path to csv containing a validated metadata manifest. The manifest should include a column entityId containing Synapse IDs of the files/entities to be associated with metadata, if applicable to the dataset type. Some datasets, e.g. clinical data, do not contain file IDs; their data is stored in a table, one row per item. In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
- datasetId: synapse ID of folder containing the dataset
- manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and table options in combination.
- hideBlanks: Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty-string values.
- restrict_manifest (bool): Default is false. Flag for censored data.
- table_manipulation (str): Default is 'replace'. Specifies how manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str) display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting.
- annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:
manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
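For orientation, here is a minimal usage sketch. The Synapse IDs, the manifest path, and the syn_store / dmge objects are hypothetical stand-ins for an authenticated SynapseStorage instance and a DataModelGraphExplorer built from the project's data model:

# Hypothetical example: upload a validated manifest and annotate dataset files.
manifest_synapse_file_id = syn_store.associateMetadataWithFiles(
    dmge=dmge,                                    # DataModelGraphExplorer instance
    metadataManifestPath="path/to/manifest.csv",  # hypothetical local path
    datasetId="syn12345678",                      # hypothetical dataset folder ID
    manifest_record_type="table_and_file",
    hideBlanks=True,
    restrict_manifest=False,
    table_manipulation="replace",
    table_column_names="class_label",
    annotation_keys="class_label",
)
print(f"Manifest uploaded as {manifest_synapse_file_id}")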
2704 def getTableAnnotations(self, table_id: str): 2705 """Generate dictionary of annotations for the given Synapse file. 2706 Synapse returns all custom annotations as lists since they 2707 can contain multiple values. In all cases, the values will 2708 be converted into strings and concatenated with ", ". 2709 2710 Args: 2711 fileId (str): Synapse ID for dataset file. 2712 2713 Returns: 2714 dict: Annotations as comma-separated strings. 2715 """ 2716 try: 2717 entity = self.synapse_entity_tracker.get( 2718 synapse_id=table_id, syn=self.syn, download_file=False 2719 ) 2720 is_table = entity.concreteType.endswith(".TableEntity") 2721 annotations_raw = entity.annotations 2722 except SynapseHTTPError: 2723 # If an error occurs with retrieving entity, skip it 2724 # This could be caused by a temporary file view that 2725 # was deleted since its ID was retrieved 2726 is_file, is_table = False, False 2727 2728 # Skip anything that isn't a file or folder 2729 if not (is_table): 2730 return None 2731 2732 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2733 2734 return annotations
Generate dictionary of annotations for the given Synapse table. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".
Arguments:
- table_id (str): Synapse ID for the table.
Returns:
dict: Annotations as comma-separated strings.
2736 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2737 """Generate dictionary of annotations for the given Synapse file. 2738 Synapse returns all custom annotations as lists since they 2739 can contain multiple values. In all cases, the values will 2740 be converted into strings and concatenated with ", ". 2741 2742 Args: 2743 fileId (str): Synapse ID for dataset file. 2744 2745 Returns: 2746 dict: Annotations as comma-separated strings. 2747 """ 2748 2749 # Get entity metadata, including annotations 2750 try: 2751 entity = self.synapse_entity_tracker.get( 2752 synapse_id=fileId, syn=self.syn, download_file=False 2753 ) 2754 is_file = entity.concreteType.endswith(".FileEntity") 2755 is_folder = entity.concreteType.endswith(".Folder") 2756 annotations_raw = entity.annotations 2757 except SynapseHTTPError: 2758 # If an error occurs with retrieving entity, skip it 2759 # This could be caused by a temporary file view that 2760 # was deleted since its ID was retrieved 2761 is_file, is_folder = False, False 2762 2763 # Skip anything that isn't a file or folder 2764 if not (is_file or is_folder): 2765 return None 2766 2767 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2768 2769 return annotations
Generate dictionary of annotations for the given Synapse file. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".
Arguments:
- fileId (str): Synapse ID for dataset file.
Returns:
dict: Annotations as comma-separated strings.
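A short sketch of the expected shape of the result (hypothetical file ID; syn_store stands in for an authenticated SynapseStorage instance):

# Hypothetical example: multi-valued annotations come back comma-joined.
annotations = syn_store.getFileAnnotations("syn23456789")  # hypothetical file ID
if annotations is not None:  # None is returned for non-file/folder entities
    # e.g. {'YearofBirth': '1980', 'author': 'bruno, milen, sujay',
    #       'entityId': 'syn23456789', 'eTag': '...'}
    print(annotations["entityId"], annotations["eTag"])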
2771 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2772 # Extract annotations from their lists and stringify. For example: 2773 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2774 annotations = dict() 2775 for key, vals in annotations_raw.items(): 2776 if isinstance(vals, list) and len(vals) == 1: 2777 annotations[key] = str(vals[0]) 2778 else: 2779 annotations[key] = ", ".join(str(v) for v in vals) 2780 2781 # Add the file entity ID and eTag, which weren't lists 2782 assert fileId == entity.id, ( 2783 "For some reason, the Synapse ID in the response doesn't match" 2784 "the Synapse ID sent in the request (via synapseclient)." 2785 ) 2786 annotations["entityId"] = fileId 2787 annotations["eTag"] = entity.etag 2788 2789 return annotations
2791 def getDatasetAnnotations( 2792 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2793 ) -> pd.DataFrame: 2794 """Generate table for annotations across all files in given dataset. 2795 2796 Args: 2797 datasetId (str): Synapse ID for dataset folder. 2798 fill_na (bool): Whether to replace missing values with 2799 blank strings. 2800 force_batch (bool): Whether to force the function to use 2801 the batch mode, which uses a file view to retrieve 2802 annotations for a given dataset. Default to False 2803 unless there are more than 50 files in the dataset. 2804 2805 Returns: 2806 pd.DataFrame: Table of annotations. 2807 """ 2808 # Get all files in given dataset 2809 dataset_files = self.getFilesInStorageDataset(datasetId) 2810 2811 # if there are no dataset files, there are no annotations 2812 # return None 2813 if not dataset_files: 2814 return pd.DataFrame() 2815 2816 dataset_files_map = dict(dataset_files) 2817 dataset_file_ids, _ = list(zip(*dataset_files)) 2818 2819 # Get annotations for each file from Step 1 2820 # Batch mode 2821 try_batch = len(dataset_files) >= 50 or force_batch 2822 if try_batch: 2823 try: 2824 logger.info("Trying batch mode for retrieving Synapse annotations") 2825 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2826 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2827 logger.info( 2828 f"Unable to create a temporary file view bound to {datasetId}. " 2829 "Defaulting to slower iterative retrieval of annotations." 2830 ) 2831 # Default to the slower non-batch method 2832 logger.info("Batch mode failed (probably due to permission error)") 2833 try_batch = False 2834 2835 # Non-batch mode 2836 if not try_batch: 2837 logger.info("Using slower (non-batch) sequential mode") 2838 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2839 # Remove any annotations for non-file/folders (stored as None) 2840 records = filter(None, records) 2841 table = pd.DataFrame.from_records(records) 2842 2843 # Add filenames for the files that "survived" annotation retrieval 2844 filenames = [dataset_files_map[i] for i in table["entityId"]] 2845 2846 if "Filename" not in table.columns: 2847 table.insert(0, "Filename", filenames) 2848 2849 # Ensure that entityId and eTag are at the end 2850 entity_ids = table.pop("entityId") 2851 etags = table.pop("eTag") 2852 table.insert(len(table.columns), "entityId", entity_ids) 2853 table.insert(len(table.columns), "eTag", etags) 2854 2855 # Missing values are filled in with empty strings for Google Sheets 2856 if fill_na: 2857 table.fillna("", inplace=True) 2858 2859 # Force all values as strings 2860 return table.astype(str)
Generate table for annotations across all files in given dataset.
Arguments:
- datasetId (str): Synapse ID for dataset folder.
- fill_na (bool): Whether to replace missing values with blank strings.
- force_batch (bool): Whether to force the function to use batch mode, which uses a file view to retrieve annotations for a given dataset. Defaults to False unless there are 50 or more files in the dataset.
Returns:
pd.DataFrame: Table of annotations.
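A usage sketch under the same assumptions (hypothetical dataset ID, authenticated syn_store):

# Hypothetical example: batch mode is attempted automatically at 50+ files,
# or can be forced with force_batch=True.
table = syn_store.getDatasetAnnotations(
    "syn12345678",      # hypothetical dataset folder ID
    fill_na=True,       # replace missing values with blank strings
    force_batch=False,
)
# 'Filename' is the first column; 'entityId' and 'eTag' are moved to the end.
print(table.columns.tolist())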
2874 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2875 @retry( 2876 stop=stop_after_attempt(5), 2877 wait=wait_chain( 2878 *[wait_fixed(10) for i in range(2)] 2879 + [wait_fixed(15) for i in range(2)] 2880 + [wait_fixed(20)] 2881 ), 2882 retry=retry_if_exception_type(LookupError), 2883 retry_error_callback=raise_final_error, 2884 ) 2885 def getDatasetProject(self, datasetId: str) -> str: 2886 """Get parent project for a given dataset ID. 2887 2888 Args: 2889 datasetId (str): Synapse entity ID (folder or project). 2890 2891 Raises: 2892 ValueError: Raised if Synapse ID cannot be retrieved 2893 by the user or if it doesn't appear in the file view. 2894 2895 Returns: 2896 str: The Synapse ID for the parent project. 2897 """ 2898 2899 # Subset main file view 2900 dataset_index = self.storageFileviewTable["id"] == datasetId 2901 dataset_row = self.storageFileviewTable[dataset_index] 2902 2903 # re-query if no datasets found 2904 if dataset_row.empty: 2905 sleep(5) 2906 self.query_fileview(force_requery=True) 2907 # Subset main file view 2908 dataset_index = self.storageFileviewTable["id"] == datasetId 2909 dataset_row = self.storageFileviewTable[dataset_index] 2910 2911 # Return `projectId` for given row if only one found 2912 if len(dataset_row) == 1: 2913 dataset_project = dataset_row["projectId"].values[0] 2914 return dataset_project 2915 2916 # Otherwise, check if already project itself 2917 try: 2918 syn_object = self.synapse_entity_tracker.get( 2919 synapse_id=datasetId, syn=self.syn, download_file=False 2920 ) 2921 if syn_object.properties["concreteType"].endswith("Project"): 2922 return datasetId 2923 except SynapseHTTPError: 2924 raise PermissionError( 2925 f"The given dataset ({datasetId}) isn't accessible with this " 2926 "user. This might be caused by a typo in the dataset Synapse ID." 2927 ) 2928 2929 # If not, then assume dataset not in file view 2930 raise LookupError( 2931 f"The given dataset ({datasetId}) doesn't appear in the " 2932 f"configured file view ({self.storageFileview}). This might " 2933 "mean that the file view's scope needs to be updated." 2934 )
Get parent project for a given dataset ID.
Arguments:
- datasetId (str): Synapse entity ID (folder or project).
Raises:
- LookupError: Raised if the given dataset ID doesn't appear in the configured file view.
- PermissionError: Raised if the given dataset isn't accessible to the user.
Returns:
str: The Synapse ID for the parent project.
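A sketch of calling this method, with the failure modes visible in the code above (hypothetical ID; note that the method retries on LookupError several times before finally raising):

# Hypothetical example: resolve a dataset folder to its parent project.
try:
    project_id = syn_store.getDatasetProject("syn12345678")  # hypothetical ID
except PermissionError:
    print("Dataset is not accessible to this user (possibly a typo in the ID).")
except LookupError:
    print("Dataset is not in the configured file view; its scope may need updating.")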
2936 def getDatasetAnnotationsBatch( 2937 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2938 ) -> pd.DataFrame: 2939 """Generate table for annotations across all files in given dataset. 2940 This function uses a temporary file view to generate a table 2941 instead of iteratively querying for individual entity annotations. 2942 This function is expected to run much faster than 2943 `self.getDatasetAnnotationsBatch` on large datasets. 2944 2945 Args: 2946 datasetId (str): Synapse ID for dataset folder. 2947 dataset_file_ids (Sequence[str]): List of Synapse IDs 2948 for dataset files/folders used to subset the table. 2949 2950 Returns: 2951 pd.DataFrame: Table of annotations. 2952 """ 2953 # Create data frame from annotations file view 2954 with DatasetFileView(datasetId, self.syn) as fileview: 2955 table = fileview.query() 2956 2957 if dataset_file_ids: 2958 table = table.loc[table.index.intersection(dataset_file_ids)] 2959 2960 table = table.reset_index(drop=True) 2961 2962 return table
Generate table for annotations across all files in given dataset. This function uses a temporary file view to generate a table instead of iteratively querying for individual entity annotations, so it is expected to run much faster than the non-batch getDatasetAnnotations on large datasets.
Arguments:
- datasetId (str): Synapse ID for dataset folder.
- dataset_file_ids (Sequence[str]): List of Synapse IDs for dataset files/folders used to subset the table.
Returns:
pd.DataFrame: Table of annotations.
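A usage sketch (hypothetical IDs; syn_store as above):

# Hypothetical example: subset the temporary file view's results
# to a known list of file/folder IDs.
file_ids = ["syn11111111", "syn22222222"]  # hypothetical Synapse IDs
table = syn_store.getDatasetAnnotationsBatch("syn12345678", dataset_file_ids=file_ids)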
2975class TableOperations: 2976 """ 2977 Object to hold functions for various table operations specific to the Synapse Asset Store. 2978 2979 Currently implement operations are: 2980 createTable: upload a manifest as a new table when none exist 2981 replaceTable: replace a metadata in a table from one manifest with metadata from another manifest 2982 updateTable: add a column to a table that already exists on synapse 2983 2984 Operations currently in development are: 2985 upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest 2986 """ 2987 2988 def __init__( 2989 self, 2990 synStore: SynapseStorage, 2991 tableToLoad: pd.DataFrame = None, 2992 tableName: str = None, 2993 datasetId: str = None, 2994 existingTableId: str = None, 2995 restrict: bool = False, 2996 synapse_entity_tracker: SynapseEntityTracker = None, 2997 ): 2998 """ 2999 Class governing table operations (creation, replacement, upserts, updates) in schematic 3000 3001 tableToLoad: manifest formatted appropriately for the table 3002 tableName: name of the table to be uploaded 3003 datasetId: synID of the dataset for the manifest 3004 existingTableId: synId of the table currently exising on synapse (if there is one) 3005 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 3006 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 3007 3008 """ 3009 self.synStore = synStore 3010 self.tableToLoad = tableToLoad 3011 self.tableName = tableName 3012 self.datasetId = datasetId 3013 self.existingTableId = existingTableId 3014 self.restrict = restrict 3015 self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker() 3016 3017 @tracer.start_as_current_span("TableOperations::createTable") 3018 def createTable( 3019 self, 3020 columnTypeDict: dict = None, 3021 specifySchema: bool = True, 3022 ): 3023 """ 3024 Method to create a table from a metadata manifest and upload it to synapse 3025 3026 Args: 3027 columnTypeDict: dictionary schema for table columns: type, size, etc 3028 specifySchema: to specify a specific schema for the table format 3029 3030 Returns: 3031 table.schema.id: synID of the newly created table 3032 """ 3033 datasetEntity = self.synapse_entity_tracker.get( 3034 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3035 ) 3036 datasetName = datasetEntity.name 3037 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3038 3039 if not self.tableName: 3040 self.tableName = datasetName + "table" 3041 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3042 if specifySchema: 3043 if columnTypeDict == {}: 3044 logger.error("Did not provide a columnTypeDict.") 3045 # create list of columns: 3046 cols = [] 3047 for col in self.tableToLoad.columns: 3048 if col in table_schema_by_cname: 3049 col_type = table_schema_by_cname[col]["columnType"] 3050 max_size = ( 3051 table_schema_by_cname[col]["maximumSize"] 3052 if "maximumSize" in table_schema_by_cname[col].keys() 3053 else 100 3054 ) 3055 max_list_len = 250 3056 if max_size and max_list_len: 3057 cols.append( 3058 Column( 3059 name=col, 3060 columnType=col_type, 3061 maximumSize=max_size, 3062 maximumListLength=max_list_len, 3063 ) 3064 ) 3065 elif max_size: 3066 cols.append( 3067 Column(name=col, columnType=col_type, maximumSize=max_size) 3068 ) 3069 else: 3070 cols.append(Column(name=col, columnType=col_type)) 3071 else: 3072 # TODO add warning that the given col was 
not found and it's max size is set to 100 3073 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3074 schema = Schema( 3075 name=self.tableName, columns=cols, parent=datasetParentProject 3076 ) 3077 table = Table(schema, self.tableToLoad) 3078 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3079 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3080 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3081 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3082 return table.schema.id 3083 else: 3084 # For just uploading the tables to synapse using default 3085 # column types. 3086 table = build_table(self.tableName, datasetParentProject, self.tableToLoad) 3087 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3088 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3089 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3090 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3091 return table.schema.id 3092 3093 @tracer.start_as_current_span("TableOperations::replaceTable") 3094 def replaceTable( 3095 self, 3096 specifySchema: bool = True, 3097 columnTypeDict: dict = None, 3098 ): 3099 """ 3100 Method to replace an existing table on synapse with metadata from a new manifest 3101 3102 Args: 3103 specifySchema: to infer a schema for the table format 3104 columnTypeDict: dictionary schema for table columns: type, size, etc 3105 3106 Returns: 3107 existingTableId: synID of the already existing table that had its metadata replaced 3108 """ 3109 datasetEntity = self.synapse_entity_tracker.get( 3110 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3111 ) 3112 3113 datasetName = datasetEntity.name 3114 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3115 existing_table, existing_results = self.synStore.get_synapse_table( 3116 self.existingTableId 3117 ) 3118 # remove rows 3119 self.synStore.syn.delete(existing_results) 3120 # Data changes such as removing all rows causes the eTag to change. 
3121 self.synapse_entity_tracker.remove(synapse_id=self.existingTableId) 3122 # wait for row deletion to finish on synapse before getting empty table 3123 sleep(10) 3124 3125 # removes all current columns 3126 current_table = self.synapse_entity_tracker.get( 3127 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3128 ) 3129 3130 current_columns = self.synStore.syn.getTableColumns(current_table) 3131 for col in current_columns: 3132 current_table.removeColumn(col) 3133 3134 if not self.tableName: 3135 self.tableName = datasetName + "table" 3136 3137 # Process columns according to manifest entries 3138 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3139 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3140 if specifySchema: 3141 if columnTypeDict == {}: 3142 logger.error("Did not provide a columnTypeDict.") 3143 # create list of columns: 3144 cols = [] 3145 3146 for col in self.tableToLoad.columns: 3147 if col in table_schema_by_cname: 3148 col_type = table_schema_by_cname[col]["columnType"] 3149 max_size = ( 3150 table_schema_by_cname[col]["maximumSize"] 3151 if "maximumSize" in table_schema_by_cname[col].keys() 3152 else 100 3153 ) 3154 max_list_len = 250 3155 if max_size and max_list_len: 3156 cols.append( 3157 Column( 3158 name=col, 3159 columnType=col_type, 3160 maximumSize=max_size, 3161 maximumListLength=max_list_len, 3162 ) 3163 ) 3164 elif max_size: 3165 cols.append( 3166 Column(name=col, columnType=col_type, maximumSize=max_size) 3167 ) 3168 else: 3169 cols.append(Column(name=col, columnType=col_type)) 3170 else: 3171 # TODO add warning that the given col was not found and it's max size is set to 100 3172 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3173 3174 # adds new columns to schema 3175 for col in cols: 3176 current_table.addColumn(col) 3177 table_result = self.synStore.syn.store( 3178 current_table, isRestricted=self.restrict 3179 ) 3180 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3181 # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema) 3182 self.synapse_entity_tracker.remove(synapse_id=table_result.id) 3183 3184 # wait for synapse store to finish 3185 sleep(1) 3186 3187 # build schema and table from columns and store with necessary restrictions 3188 schema = Schema( 3189 name=self.tableName, columns=cols, parent=datasetParentProject 3190 ) 3191 schema.id = self.existingTableId 3192 table = Table(schema, self.tableToLoad, etag=existing_results.etag) 3193 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3194 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3195 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3196 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3197 else: 3198 logging.error("Must specify a schema for table replacements") 3199 3200 # remove system metadata from manifest 3201 existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) 3202 return self.existingTableId 3203 3204 @tracer.start_as_current_span("TableOperations::_get_auth_token") 3205 def _get_auth_token( 3206 self, 3207 ): 3208 authtoken = None 3209 3210 # Get access token from environment variable if available 3211 # Primarily useful for testing environments, with other possible usefulness for containers 3212 env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") 3213 if env_access_token: 3214 authtoken 
= env_access_token 3215 return authtoken 3216 3217 # Get token from authorization header 3218 # Primarily useful for API endpoint functionality 3219 if "Authorization" in self.synStore.syn.default_headers: 3220 authtoken = self.synStore.syn.default_headers["Authorization"].split( 3221 "Bearer " 3222 )[-1] 3223 return authtoken 3224 3225 # retrive credentials from synapse object 3226 # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe 3227 synapse_object_creds = self.synStore.syn.credentials 3228 if hasattr(synapse_object_creds, "_token"): 3229 authtoken = synapse_object_creds.secret 3230 3231 # Try getting creds from .synapseConfig file if it exists 3232 # Primarily useful for local users. Seems to correlate with credentials stored in synaspe object when logged in 3233 if os.path.exists(CONFIG.synapse_configuration_path): 3234 config = get_config_file(CONFIG.synapse_configuration_path) 3235 3236 # check which credentials are provided in file 3237 if config.has_option("authentication", "authtoken"): 3238 authtoken = config.get("authentication", "authtoken") 3239 3240 # raise error if required credentials are not found 3241 if not authtoken: 3242 raise NameError( 3243 "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file" 3244 ) 3245 3246 return authtoken 3247 3248 @tracer.start_as_current_span("TableOperations::upsertTable") 3249 def upsertTable(self, dmge: DataModelGraphExplorer): 3250 """ 3251 Method to upsert rows from a new manifest into an existing table on synapse 3252 For upsert functionality to work, primary keys must follow the naming convention of <componenet>_id 3253 `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. 3254 Currently it is required to use -dl/--use_display_label with table upserts. 
3255 3256 3257 Args: 3258 dmge: DataModelGraphExplorer instance 3259 3260 Returns: 3261 existingTableId: synID of the already existing table that had its metadata replaced 3262 """ 3263 3264 authtoken = self._get_auth_token() 3265 3266 synapseDB = SynapseDatabase( 3267 auth_token=authtoken, 3268 project_id=self.synStore.getDatasetProject(self.datasetId), 3269 syn=self.synStore.syn, 3270 synapse_entity_tracker=self.synapse_entity_tracker, 3271 ) 3272 3273 try: 3274 # Try performing upsert 3275 synapseDB.upsert_table_rows( 3276 table_name=self.tableName, data=self.tableToLoad 3277 ) 3278 except SynapseHTTPError as ex: 3279 # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload 3280 if "Id is not a valid column name or id" in str(ex): 3281 self._update_table_uuid_column(dmge) 3282 synapseDB.upsert_table_rows( 3283 table_name=self.tableName, data=self.tableToLoad 3284 ) 3285 # Raise if other error 3286 else: 3287 raise ex 3288 3289 return self.existingTableId 3290 3291 @tracer.start_as_current_span("TableOperations::_update_table_uuid_column") 3292 def _update_table_uuid_column( 3293 self, 3294 dmge: DataModelGraphExplorer, 3295 ) -> None: 3296 """Removes the `Uuid` column when present, and relpaces with an `Id` column 3297 Used to enable backwards compatability for manifests using the old `Uuid` convention 3298 3299 Args: 3300 dmge: DataModelGraphExplorer instance 3301 3302 Returns: 3303 None 3304 """ 3305 3306 # Get the columns of the schema 3307 schema = self.synapse_entity_tracker.get( 3308 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3309 ) 3310 3311 cols = self.synStore.syn.getTableColumns(schema) 3312 3313 # Iterate through columns until `Uuid` column is found 3314 for col in cols: 3315 if col.name.lower() == "uuid": 3316 # See if schema has `Uuid` column specified 3317 try: 3318 uuid_col_in_schema = dmge.is_class_in_schema(col.name) 3319 except KeyError: 3320 uuid_col_in_schema = False 3321 3322 # If there is, then create a new `Id` column from scratch 3323 if uuid_col_in_schema: 3324 new_col = Column(columnType="STRING", maximumSize=64, name="Id") 3325 schema.addColumn(new_col) 3326 schema = self.synStore.syn.store(schema) 3327 # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema) 3328 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3329 self.synapse_entity_tracker.remove(synapse_id=schema.id) 3330 # If there is not, then use the old `Uuid` column as a basis for the new `Id` column 3331 else: 3332 # Build ColumnModel that will be used for new column 3333 id_column = Column( 3334 name="Id", 3335 columnType="STRING", 3336 maximumSize=64, 3337 defaultValue=None, 3338 maximumListLength=1, 3339 ) 3340 new_col_response = self.synStore.syn.store(id_column) 3341 3342 # Define columnChange body 3343 columnChangeDict = { 3344 "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest", 3345 "entityId": self.existingTableId, 3346 "changes": [ 3347 { 3348 "oldColumnId": col["id"], 3349 "newColumnId": new_col_response["id"], 3350 } 3351 ], 3352 } 3353 3354 self.synStore.syn._async_table_update( 3355 table=self.existingTableId, 3356 changes=[columnChangeDict], 3357 wait=False, 3358 ) 3359 break 3360 3361 return 3362 3363 @tracer.start_as_current_span("TableOperations::updateTable") 3364 def updateTable( 3365 self, 3366 update_col: str = "Id", 3367 ): 3368 """ 3369 Method to update an existing table with a new column 3370 3371 Args: 3372 
updateCol: column to index the old and new tables on 3373 3374 Returns: 3375 existingTableId: synID of the already existing table that had its metadata replaced 3376 """ 3377 existing_table, existing_results = self.synStore.get_synapse_table( 3378 self.existingTableId 3379 ) 3380 3381 self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) 3382 # store table with existing etag data and impose restrictions as appropriate 3383 table_result = self.synStore.syn.store( 3384 Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), 3385 isRestricted=self.restrict, 3386 ) 3387 # We cannot store the Table to the `synapse_entity_tracker` because there is 3388 # not `Schema` on the table object. The above `.store()` function call would 3389 # also update the ETag of the entity within Synapse. Remove it from the tracker 3390 # and re-retrieve it later on if needed again. 3391 self.synapse_entity_tracker.remove(synapse_id=table_result.tableId) 3392 3393 return self.existingTableId
Object to hold functions for various table operations specific to the Synapse Asset Store.
Currently implemented operations are:
- createTable: upload a manifest as a new table when none exists
- replaceTable: replace metadata in a table from one manifest with metadata from another manifest
- updateTable: add a column to a table that already exists on Synapse
Operations currently in development are:
- upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
2988 def __init__( 2989 self, 2990 synStore: SynapseStorage, 2991 tableToLoad: pd.DataFrame = None, 2992 tableName: str = None, 2993 datasetId: str = None, 2994 existingTableId: str = None, 2995 restrict: bool = False, 2996 synapse_entity_tracker: SynapseEntityTracker = None, 2997 ): 2998 """ 2999 Class governing table operations (creation, replacement, upserts, updates) in schematic 3000 3001 tableToLoad: manifest formatted appropriately for the table 3002 tableName: name of the table to be uploaded 3003 datasetId: synID of the dataset for the manifest 3004 existingTableId: synId of the table currently exising on synapse (if there is one) 3005 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 3006 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 3007 3008 """ 3009 self.synStore = synStore 3010 self.tableToLoad = tableToLoad 3011 self.tableName = tableName 3012 self.datasetId = datasetId 3013 self.existingTableId = existingTableId 3014 self.restrict = restrict 3015 self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
Class governing table operations (creation, replacement, upserts, updates) in schematic
- tableToLoad: manifest formatted appropriately for the table
- tableName: name of the table to be uploaded
- datasetId: synID of the dataset for the manifest
- existingTableId: synID of the table currently existing on Synapse (if there is one)
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
- synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
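A construction sketch (all names and IDs hypothetical; manifest_df is a pandas DataFrame shaped like a manifest):

# Hypothetical example: wire up TableOperations for a new table upload.
table_ops = TableOperations(
    synStore=syn_store,                  # authenticated SynapseStorage instance
    tableToLoad=manifest_df,             # manifest as a pd.DataFrame
    tableName="example_manifest_table",  # hypothetical table name
    datasetId="syn12345678",             # hypothetical dataset folder ID
    existingTableId=None,                # set when replacing/upserting/updating
    restrict=False,
)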
3017 @tracer.start_as_current_span("TableOperations::createTable") 3018 def createTable( 3019 self, 3020 columnTypeDict: dict = None, 3021 specifySchema: bool = True, 3022 ): 3023 """ 3024 Method to create a table from a metadata manifest and upload it to synapse 3025 3026 Args: 3027 columnTypeDict: dictionary schema for table columns: type, size, etc 3028 specifySchema: to specify a specific schema for the table format 3029 3030 Returns: 3031 table.schema.id: synID of the newly created table 3032 """ 3033 datasetEntity = self.synapse_entity_tracker.get( 3034 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3035 ) 3036 datasetName = datasetEntity.name 3037 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3038 3039 if not self.tableName: 3040 self.tableName = datasetName + "table" 3041 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3042 if specifySchema: 3043 if columnTypeDict == {}: 3044 logger.error("Did not provide a columnTypeDict.") 3045 # create list of columns: 3046 cols = [] 3047 for col in self.tableToLoad.columns: 3048 if col in table_schema_by_cname: 3049 col_type = table_schema_by_cname[col]["columnType"] 3050 max_size = ( 3051 table_schema_by_cname[col]["maximumSize"] 3052 if "maximumSize" in table_schema_by_cname[col].keys() 3053 else 100 3054 ) 3055 max_list_len = 250 3056 if max_size and max_list_len: 3057 cols.append( 3058 Column( 3059 name=col, 3060 columnType=col_type, 3061 maximumSize=max_size, 3062 maximumListLength=max_list_len, 3063 ) 3064 ) 3065 elif max_size: 3066 cols.append( 3067 Column(name=col, columnType=col_type, maximumSize=max_size) 3068 ) 3069 else: 3070 cols.append(Column(name=col, columnType=col_type)) 3071 else: 3072 # TODO add warning that the given col was not found and it's max size is set to 100 3073 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3074 schema = Schema( 3075 name=self.tableName, columns=cols, parent=datasetParentProject 3076 ) 3077 table = Table(schema, self.tableToLoad) 3078 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3079 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3080 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3081 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3082 return table.schema.id 3083 else: 3084 # For just uploading the tables to synapse using default 3085 # column types. 3086 table = build_table(self.tableName, datasetParentProject, self.tableToLoad) 3087 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3088 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3089 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3090 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3091 return table.schema.id
Method to create a table from a metadata manifest and upload it to synapse
Arguments:
- columnTypeDict: dictionary schema for table columns: type, size, etc
- specifySchema: whether to specify an explicit schema for the table format
Returns:
table.schema.id: synID of the newly created table
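For illustration, a sketch of creating a table with an explicit schema. The shape of columnTypeDict shown here (column names mapping to 'columnType'/'maximumSize' entries) is an assumption inferred from the lookups in the code above, not a documented contract:

# Hypothetical example: explicit column schema for two manifest columns.
column_types = {
    "PatientID": {"columnType": "STRING", "maximumSize": 64},  # assumed dict shape
    "YearofBirth": {"columnType": "INTEGER"},
}
new_table_id = table_ops.createTable(columnTypeDict=column_types, specifySchema=True)
# With specifySchema=False, build_table infers default column types instead.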
3093 @tracer.start_as_current_span("TableOperations::replaceTable") 3094 def replaceTable( 3095 self, 3096 specifySchema: bool = True, 3097 columnTypeDict: dict = None, 3098 ): 3099 """ 3100 Method to replace an existing table on synapse with metadata from a new manifest 3101 3102 Args: 3103 specifySchema: to infer a schema for the table format 3104 columnTypeDict: dictionary schema for table columns: type, size, etc 3105 3106 Returns: 3107 existingTableId: synID of the already existing table that had its metadata replaced 3108 """ 3109 datasetEntity = self.synapse_entity_tracker.get( 3110 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3111 ) 3112 3113 datasetName = datasetEntity.name 3114 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3115 existing_table, existing_results = self.synStore.get_synapse_table( 3116 self.existingTableId 3117 ) 3118 # remove rows 3119 self.synStore.syn.delete(existing_results) 3120 # Data changes such as removing all rows causes the eTag to change. 3121 self.synapse_entity_tracker.remove(synapse_id=self.existingTableId) 3122 # wait for row deletion to finish on synapse before getting empty table 3123 sleep(10) 3124 3125 # removes all current columns 3126 current_table = self.synapse_entity_tracker.get( 3127 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3128 ) 3129 3130 current_columns = self.synStore.syn.getTableColumns(current_table) 3131 for col in current_columns: 3132 current_table.removeColumn(col) 3133 3134 if not self.tableName: 3135 self.tableName = datasetName + "table" 3136 3137 # Process columns according to manifest entries 3138 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3139 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3140 if specifySchema: 3141 if columnTypeDict == {}: 3142 logger.error("Did not provide a columnTypeDict.") 3143 # create list of columns: 3144 cols = [] 3145 3146 for col in self.tableToLoad.columns: 3147 if col in table_schema_by_cname: 3148 col_type = table_schema_by_cname[col]["columnType"] 3149 max_size = ( 3150 table_schema_by_cname[col]["maximumSize"] 3151 if "maximumSize" in table_schema_by_cname[col].keys() 3152 else 100 3153 ) 3154 max_list_len = 250 3155 if max_size and max_list_len: 3156 cols.append( 3157 Column( 3158 name=col, 3159 columnType=col_type, 3160 maximumSize=max_size, 3161 maximumListLength=max_list_len, 3162 ) 3163 ) 3164 elif max_size: 3165 cols.append( 3166 Column(name=col, columnType=col_type, maximumSize=max_size) 3167 ) 3168 else: 3169 cols.append(Column(name=col, columnType=col_type)) 3170 else: 3171 # TODO add warning that the given col was not found and it's max size is set to 100 3172 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3173 3174 # adds new columns to schema 3175 for col in cols: 3176 current_table.addColumn(col) 3177 table_result = self.synStore.syn.store( 3178 current_table, isRestricted=self.restrict 3179 ) 3180 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3181 # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema) 3182 self.synapse_entity_tracker.remove(synapse_id=table_result.id) 3183 3184 # wait for synapse store to finish 3185 sleep(1) 3186 3187 # build schema and table from columns and store with necessary restrictions 3188 schema = Schema( 3189 name=self.tableName, columns=cols, parent=datasetParentProject 3190 ) 3191 schema.id = 
self.existingTableId 3192 table = Table(schema, self.tableToLoad, etag=existing_results.etag) 3193 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3194 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3195 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3196 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3197 else: 3198 logging.error("Must specify a schema for table replacements") 3199 3200 # remove system metadata from manifest 3201 existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) 3202 return self.existingTableId
Method to replace an existing table on synapse with metadata from a new manifest
Arguments:
- specifySchema: whether to specify an explicit schema for the table format
- columnTypeDict: dictionary schema for table columns: type, size, etc
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
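A usage sketch (hypothetical IDs; column_types as in the createTable example above):

# Hypothetical example: replace the contents of an existing table in place.
table_ops = TableOperations(
    synStore=syn_store,
    tableToLoad=new_manifest_df,     # replacement manifest as a pd.DataFrame
    datasetId="syn12345678",         # hypothetical dataset folder ID
    existingTableId="syn87654321",   # hypothetical ID of the table to replace
)
replaced_id = table_ops.replaceTable(specifySchema=True, columnTypeDict=column_types)
# The table keeps its original Synapse ID after replacement.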
3248 @tracer.start_as_current_span("TableOperations::upsertTable") 3249 def upsertTable(self, dmge: DataModelGraphExplorer): 3250 """ 3251 Method to upsert rows from a new manifest into an existing table on synapse 3252 For upsert functionality to work, primary keys must follow the naming convention of <componenet>_id 3253 `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. 3254 Currently it is required to use -dl/--use_display_label with table upserts. 3255 3256 3257 Args: 3258 dmge: DataModelGraphExplorer instance 3259 3260 Returns: 3261 existingTableId: synID of the already existing table that had its metadata replaced 3262 """ 3263 3264 authtoken = self._get_auth_token() 3265 3266 synapseDB = SynapseDatabase( 3267 auth_token=authtoken, 3268 project_id=self.synStore.getDatasetProject(self.datasetId), 3269 syn=self.synStore.syn, 3270 synapse_entity_tracker=self.synapse_entity_tracker, 3271 ) 3272 3273 try: 3274 # Try performing upsert 3275 synapseDB.upsert_table_rows( 3276 table_name=self.tableName, data=self.tableToLoad 3277 ) 3278 except SynapseHTTPError as ex: 3279 # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload 3280 if "Id is not a valid column name or id" in str(ex): 3281 self._update_table_uuid_column(dmge) 3282 synapseDB.upsert_table_rows( 3283 table_name=self.tableName, data=self.tableToLoad 3284 ) 3285 # Raise if other error 3286 else: 3287 raise ex 3288 3289 return self.existingTableId
Method to upsert rows from a new manifest into an existing table on synapse
For upsert functionality to work, primary keys must follow the naming convention of <component>_id. '-tm upsert' should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
Currently it is required to use -dl/--use_display_label with table upserts.
Arguments:
- dmge: DataModelGraphExplorer instance
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
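A sketch of the primary-key convention (the component name 'Patient' and all IDs are hypothetical):

# Hypothetical example: the manifest carries a "<component>_id" primary key.
import pandas as pd

manifest_df = pd.DataFrame(
    {
        "Patient_id": ["p1", "p2"],        # primary key per the naming convention
        "YearofBirth": [1980, 1990],
        "Id": ["uuid-0001", "uuid-0002"],  # Id column expected by newer tables
    }
)
table_ops = TableOperations(
    synStore=syn_store,
    tableToLoad=manifest_df,
    tableName="patient_manifest_table",  # hypothetical
    datasetId="syn12345678",             # hypothetical
    existingTableId="syn87654321",       # hypothetical
)
table_ops.upsertTable(dmge)  # dmge: DataModelGraphExplorer instance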
3363 @tracer.start_as_current_span("TableOperations::updateTable") 3364 def updateTable( 3365 self, 3366 update_col: str = "Id", 3367 ): 3368 """ 3369 Method to update an existing table with a new column 3370 3371 Args: 3372 updateCol: column to index the old and new tables on 3373 3374 Returns: 3375 existingTableId: synID of the already existing table that had its metadata replaced 3376 """ 3377 existing_table, existing_results = self.synStore.get_synapse_table( 3378 self.existingTableId 3379 ) 3380 3381 self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) 3382 # store table with existing etag data and impose restrictions as appropriate 3383 table_result = self.synStore.syn.store( 3384 Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), 3385 isRestricted=self.restrict, 3386 ) 3387 # We cannot store the Table to the `synapse_entity_tracker` because there is 3388 # not `Schema` on the table object. The above `.store()` function call would 3389 # also update the ETag of the entity within Synapse. Remove it from the tracker 3390 # and re-retrieve it later on if needed again. 3391 self.synapse_entity_tracker.remove(synapse_id=table_result.tableId) 3392 3393 return self.existingTableId
Method to update an existing table with a new column
Arguments:
- update_col: column to index the old and new tables on
Returns:
existingTableId: synID of the already existing table that had its metadata updated
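A brief sketch (table_ops configured with an existingTableId, as above):

# Hypothetical example: merge new data into an existing table,
# joining old and new rows on the "Id" column by default.
updated_id = table_ops.updateTable(update_col="Id")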
3396class DatasetFileView: 3397 """Helper class to create temporary dataset file views. 3398 This class can be used in conjunction with a 'with' statement. 3399 This will ensure that the file view is deleted automatically. 3400 See SynapseStorage.getDatasetAnnotationsBatch for example usage. 3401 """ 3402 3403 def __init__( 3404 self, 3405 datasetId: str, 3406 synapse: Synapse, 3407 name: str = None, 3408 temporary: bool = True, 3409 parentId: str = None, 3410 ) -> None: 3411 """Create a file view scoped to a dataset folder. 3412 3413 Args: 3414 datasetId (str): Synapse ID for a dataset folder/project. 3415 synapse (Synapse): Used for Synapse requests. 3416 name (str): Name of the file view (temporary or not). 3417 temporary (bool): Whether to delete the file view on exit 3418 of either a 'with' statement or Python entirely. 3419 parentId (str, optional): Synapse ID specifying where to 3420 store the file view. Defaults to datasetId. 3421 """ 3422 3423 self.datasetId = datasetId 3424 self.synapse = synapse 3425 self.is_temporary = temporary 3426 3427 if name is None: 3428 self.name = f"schematic annotation file view for {self.datasetId}" 3429 3430 if self.is_temporary: 3431 uid = secrets.token_urlsafe(5) 3432 self.name = f"{self.name} - UID {uid}" 3433 3434 # TODO: Allow a DCC admin to configure a "universal parent" 3435 # Such as a Synapse project writeable by everyone. 3436 self.parentId = datasetId if parentId is None else parentId 3437 3438 # TODO: Create local sharing setting to hide from everyone else 3439 view_schema = EntityViewSchema( 3440 name=self.name, 3441 parent=self.parentId, 3442 scopes=self.datasetId, 3443 includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], 3444 addDefaultViewColumns=False, 3445 addAnnotationColumns=True, 3446 ) 3447 3448 # TODO: Handle failure due to insufficient permissions by 3449 # creating a temporary new project to store view 3450 self.view_schema = self.synapse.store(view_schema) 3451 3452 # These are filled in after calling `self.query()` 3453 self.results = None 3454 self.table = None 3455 3456 # Ensure deletion of the file view (last resort) 3457 if self.is_temporary: 3458 atexit.register(self.delete) 3459 3460 def __enter__(self): 3461 """Return file view when entering 'with' statement.""" 3462 return self 3463 3464 def __exit__(self, exc_type, exc_value, traceback): 3465 """Delete file view when exiting 'with' statement.""" 3466 if self.is_temporary: 3467 self.delete() 3468 3469 def delete(self): 3470 """Delete the file view on Synapse without deleting local table.""" 3471 if self.view_schema is not None: 3472 self.synapse.delete(self.view_schema) 3473 self.view_schema = None 3474 3475 def query(self, tidy=True, force=False): 3476 """Retrieve file view as a data frame (raw format sans index).""" 3477 if self.table is None or force: 3478 fileview_id = self.view_schema["id"] 3479 self.results = self.synapse.tableQuery(f"select * from {fileview_id}") 3480 self.table = self.results.asDataFrame( 3481 rowIdAndVersionInIndex=False, 3482 na_values=STR_NA_VALUES_FILTERED, 3483 keep_default_na=False, 3484 ) 3485 if tidy: 3486 self.tidy_table() 3487 return self.table 3488 3489 def tidy_table(self): 3490 """Convert raw file view data frame into more usable format.""" 3491 assert self.table is not None, "Must call `self.query()` first." 
3492 self._fix_default_columns() 3493 self._fix_list_columns() 3494 self._fix_int_columns() 3495 return self.table 3496 3497 def _fix_default_columns(self): 3498 """Rename default columns to match schematic expectations.""" 3499 3500 # Drop ROW_VERSION column if present 3501 if "ROW_VERSION" in self.table: 3502 del self.table["ROW_VERSION"] 3503 3504 # Rename id column to entityId and set as data frame index 3505 if "ROW_ID" in self.table: 3506 self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str) 3507 self.table = self.table.set_index("entityId", drop=False) 3508 del self.table["ROW_ID"] 3509 3510 # Rename ROW_ETAG column to eTag and place at end of data frame 3511 if "ROW_ETAG" in self.table: 3512 row_etags = self.table.pop("ROW_ETAG") 3513 3514 # eTag column may already present if users annotated data without submitting manifest 3515 # we're only concerned with the new values and not the existing ones 3516 if "eTag" in self.table: 3517 del self.table["eTag"] 3518 3519 self.table.insert(len(self.table.columns), "eTag", row_etags) 3520 3521 return self.table 3522 3523 def _get_columns_of_type(self, types): 3524 """Helper function to get list of columns of a given type(s).""" 3525 matching_columns = [] 3526 for header in self.results.headers: 3527 if header.columnType in types: 3528 matching_columns.append(header.name) 3529 return matching_columns 3530 3531 def _fix_list_columns(self): 3532 """Fix formatting of list-columns.""" 3533 list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"} 3534 list_columns = self._get_columns_of_type(list_types) 3535 for col in list_columns: 3536 self.table[col] = self.table[col].apply(lambda x: ", ".join(x)) 3537 return self.table 3538 3539 def _fix_int_columns(self): 3540 """Ensure that integer-columns are actually integers.""" 3541 int_columns = self._get_columns_of_type({"INTEGER"}) 3542 for col in int_columns: 3543 # Coercing to string because NaN is a floating point value 3544 # and cannot exist alongside integers in a column 3545 def to_int_fn(x): 3546 return "" if np.isnan(x) else str(int(x)) 3547 3548 self.table[col] = self.table[col].apply(to_int_fn) 3549 return self.table
Helper class to create temporary dataset file views. This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3403 def __init__( 3404 self, 3405 datasetId: str, 3406 synapse: Synapse, 3407 name: str = None, 3408 temporary: bool = True, 3409 parentId: str = None, 3410 ) -> None: 3411 """Create a file view scoped to a dataset folder. 3412 3413 Args: 3414 datasetId (str): Synapse ID for a dataset folder/project. 3415 synapse (Synapse): Used for Synapse requests. 3416 name (str): Name of the file view (temporary or not). 3417 temporary (bool): Whether to delete the file view on exit 3418 of either a 'with' statement or Python entirely. 3419 parentId (str, optional): Synapse ID specifying where to 3420 store the file view. Defaults to datasetId. 3421 """ 3422 3423 self.datasetId = datasetId 3424 self.synapse = synapse 3425 self.is_temporary = temporary 3426 3427 if name is None: 3428 self.name = f"schematic annotation file view for {self.datasetId}" 3429 3430 if self.is_temporary: 3431 uid = secrets.token_urlsafe(5) 3432 self.name = f"{self.name} - UID {uid}" 3433 3434 # TODO: Allow a DCC admin to configure a "universal parent" 3435 # Such as a Synapse project writeable by everyone. 3436 self.parentId = datasetId if parentId is None else parentId 3437 3438 # TODO: Create local sharing setting to hide from everyone else 3439 view_schema = EntityViewSchema( 3440 name=self.name, 3441 parent=self.parentId, 3442 scopes=self.datasetId, 3443 includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], 3444 addDefaultViewColumns=False, 3445 addAnnotationColumns=True, 3446 ) 3447 3448 # TODO: Handle failure due to insufficient permissions by 3449 # creating a temporary new project to store view 3450 self.view_schema = self.synapse.store(view_schema) 3451 3452 # These are filled in after calling `self.query()` 3453 self.results = None 3454 self.table = None 3455 3456 # Ensure deletion of the file view (last resort) 3457 if self.is_temporary: 3458 atexit.register(self.delete)
Create a file view scoped to a dataset folder.
Arguments:
- datasetId (str): Synapse ID for a dataset folder/project.
- synapse (Synapse): Used for Synapse requests.
- name (str): Name of the file view (temporary or not).
- temporary (bool): Whether to delete the file view on exit of either a 'with' statement or Python entirely.
- parentId (str, optional): Synapse ID specifying where to store the file view. Defaults to datasetId.
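A usage sketch (hypothetical dataset ID; syn_store.syn is the underlying synapseclient.Synapse object):

# Hypothetical example: temporary file view, deleted automatically on exit.
with DatasetFileView("syn12345678", syn_store.syn) as fileview:
    annotations_table = fileview.query(tidy=True)
# The view is removed when the 'with' block exits (and registered with
# atexit as a last resort for temporary views).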
3469 def delete(self): 3470 """Delete the file view on Synapse without deleting local table.""" 3471 if self.view_schema is not None: 3472 self.synapse.delete(self.view_schema) 3473 self.view_schema = None
Delete the file view on Synapse without deleting local table.
3475 def query(self, tidy=True, force=False): 3476 """Retrieve file view as a data frame (raw format sans index).""" 3477 if self.table is None or force: 3478 fileview_id = self.view_schema["id"] 3479 self.results = self.synapse.tableQuery(f"select * from {fileview_id}") 3480 self.table = self.results.asDataFrame( 3481 rowIdAndVersionInIndex=False, 3482 na_values=STR_NA_VALUES_FILTERED, 3483 keep_default_na=False, 3484 ) 3485 if tidy: 3486 self.tidy_table() 3487 return self.table
Retrieve file view as a data frame (raw format sans index).
3489 def tidy_table(self): 3490 """Convert raw file view data frame into more usable format.""" 3491 assert self.table is not None, "Must call `self.query()` first." 3492 self._fix_default_columns() 3493 self._fix_list_columns() 3494 self._fix_int_columns() 3495 return self.table
Convert raw file view data frame into more usable format.