schematic.store.synapse
Synapse storage class
1"""Synapse storage class""" 2 3import asyncio 4import atexit 5import logging 6import os 7import re 8import secrets 9import shutil 10import time 11import uuid # used to generate unique names for entities 12from copy import deepcopy 13from dataclasses import dataclass, field 14from time import sleep 15 16# allows specifying explicit variable types 17from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union 18 19import numpy as np 20import pandas as pd 21import synapseclient 22from opentelemetry import trace 23from synapseclient import Annotations as OldAnnotations 24from synapseclient import ( 25 Column, 26 EntityViewSchema, 27 EntityViewType, 28 File, 29 Folder, 30 Schema, 31 Synapse, 32 Table, 33 as_table_columns, 34) 35from synapseclient.annotations import _convert_to_annotations_list 36from synapseclient.api import get_config_file, get_entity_id_bundle2 37from synapseclient.core.constants.concrete_types import PROJECT_ENTITY 38from synapseclient.core.exceptions import ( 39 SynapseAuthenticationError, 40 SynapseHTTPError, 41 SynapseUnmetAccessRestrictions, 42) 43from synapseclient.models.annotations import Annotations 44from synapseclient.table import CsvFileTable, Schema, build_table 45from tenacity import ( 46 retry, 47 retry_if_exception_type, 48 stop_after_attempt, 49 wait_chain, 50 wait_fixed, 51) 52 53from schematic.configuration.configuration import CONFIG 54from schematic.exceptions import AccessCredentialsError 55from schematic.schemas.data_model_graph import DataModelGraphExplorer 56from schematic.store.base import BaseStorage 57from schematic.store.database.synapse_database import SynapseDatabase 58from schematic.store.synapse_tracker import SynapseEntityTracker 59from schematic.utils.df_utils import ( 60 STR_NA_VALUES_FILTERED, 61 col_in_dataframe, 62 load_df, 63 update_df, 64) 65 66# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment 67# Please do not remove these import statements 68from schematic.utils.general import ( 69 check_synapse_cache_size, 70 clear_synapse_cache, 71 create_temp_folder, 72 entity_type_mapping, 73 get_dir_size, 74) 75from schematic.utils.io_utils import cleanup_temporary_storage 76from schematic.utils.schema_utils import get_class_label_from_display_name 77from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list 78 79logger = logging.getLogger("Synapse storage") 80 81tracer = trace.get_tracer("Schematic") 82 83 84@dataclass 85class ManifestDownload(object): 86 """ 87 syn: an object of type synapseclient. 88 manifest_id: id of a manifest 89 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 90 """ 91 92 syn: synapseclient.Synapse 93 manifest_id: str 94 synapse_entity_tracker: SynapseEntityTracker = field( 95 default_factory=SynapseEntityTracker 96 ) 97 98 def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File: 99 """ 100 Try downloading a manifest to a specific folder (temporary or not). When the 101 `use_temporary_folder` is set to True, the manifest will be downloaded to a 102 temporary folder. This is useful for when the code is running as an API server 103 where multiple requests are being made at the same time. This will prevent 104 multiple requests from overwriting the same manifest file. When the 105 `use_temporary_folder` is set to False, the manifest will be downloaded to the 106 default manifest folder. 
107 108 Args: 109 use_temporary_folder: boolean argument indicating if a temporary folder 110 should be used to store the manifest file. This is useful when running 111 this code as an API server where multiple requests could be made at the 112 same time. This is set to False when the code is being used from the 113 CLI. Defaults to True. 114 115 Return: 116 manifest_data: A Synapse file entity of the downloaded manifest 117 """ 118 manifest_data = self.synapse_entity_tracker.get( 119 synapse_id=self.manifest_id, 120 syn=self.syn, 121 download_file=False, 122 retrieve_if_not_present=False, 123 ) 124 current_span = trace.get_current_span() 125 if ( 126 manifest_data 127 and (file_handle := manifest_data.get("_file_handle", None)) 128 and current_span.is_recording() 129 ): 130 current_span.set_attribute( 131 "schematic.manifest_size", file_handle.get("contentSize", 0) 132 ) 133 134 if manifest_data and manifest_data.path: 135 return manifest_data 136 137 if "SECRETS_MANAGER_SECRETS" in os.environ: 138 temporary_manifest_storage = "/var/tmp/temp_manifest_download" 139 cleanup_temporary_storage( 140 temporary_manifest_storage, time_delta_seconds=3600 141 ) 142 # create a new directory to store manifest 143 if not os.path.exists(temporary_manifest_storage): 144 os.mkdir(temporary_manifest_storage) 145 # create temporary folders for storing manifests 146 download_location = create_temp_folder( 147 path=temporary_manifest_storage, 148 prefix=f"{self.manifest_id}-{time.time()}-", 149 ) 150 else: 151 if use_temporary_folder: 152 download_location = create_temp_folder( 153 path=CONFIG.manifest_folder, 154 prefix=f"{self.manifest_id}-{time.time()}-", 155 ) 156 else: 157 download_location = CONFIG.manifest_folder 158 159 manifest_data = self.synapse_entity_tracker.get( 160 synapse_id=self.manifest_id, 161 syn=self.syn, 162 download_file=True, 163 retrieve_if_not_present=True, 164 download_location=download_location, 165 ) 166 167 # This is doing a rename of the downloaded file. The reason this is important 168 # is that if we are re-using a file that was previously downloaded, but the 169 # file had been renamed. The file downloaded from the Synapse client is just 170 # a direct copy of that renamed file. This code will set the name of the file 171 # to the original name that was used to download the file. Note: An MD5 checksum 172 # of the file will still be performed so if the file has changed, it will be 173 # downloaded again. 

    def _entity_type_checking(self) -> None:
        """
        Check the entity type of the id that needs to be downloaded.
        Return:
            if the entity type is wrong, log an error
        """
        # check the type of entity
        entity_type = entity_type_mapping(
            syn=self.syn,
            entity_id=self.manifest_id,
            synapse_entity_tracker=self.synapse_entity_tracker,
        )
        if entity_type != "file":
            logger.error(
                f"You are using entity type: {entity_type}. Please provide a file ID"
            )

    def download_manifest(
        self,
        newManifestName: str = "",
        manifest_df: pd.DataFrame = pd.DataFrame(),
        use_temporary_folder: bool = True,
    ) -> Union[str, File]:
        """
        Download a manifest based on a given manifest id.
        Args:
            newManifestName (optional): new name of a manifest that gets downloaded.
            manifest_df (optional): a dataframe containing the name and id of manifests in a given asset view
            use_temporary_folder (optional): whether to download to a temporary folder. Defaults to True.
        Return:
            manifest_data: synapse entity file object
        """

        # enables retrying if user does not have access to uncensored manifest
        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
        manifest_data = ""

        # check entity type
        self._entity_type_checking()

        # download a manifest
        try:
            manifest_data = self._download_manifest_to_folder(
                use_temporary_folder=use_temporary_folder
            )
        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
            # if there's an error getting an uncensored manifest, try getting the censored manifest
            if not manifest_df.empty:
                censored_regex = re.compile(".*censored.*")
                censored = manifest_df["name"].str.contains(censored_regex)
                new_manifest_id = manifest_df[censored]["id"].iloc[0]
                self.manifest_id = new_manifest_id
                try:
                    manifest_data = self._download_manifest_to_folder(
                        use_temporary_folder=use_temporary_folder
                    )
                except (
                    SynapseUnmetAccessRestrictions,
                    SynapseAuthenticationError,
                ) as e:
                    raise PermissionError(
                        "You don't have access to censored and uncensored manifests in this dataset."
                    ) from e
            else:
                logger.error(
                    f"You don't have access to the requested resource: {self.manifest_id}"
                )

        if newManifestName and os.path.exists(manifest_data.get("path")):
            # Rename the file we just made to the new name
            new_manifest_filename = newManifestName + ".csv"

            # get the location of the existing manifest. The manifest that will be
            # renamed should live in the same folder as the existing manifest.
            parent_folder = os.path.dirname(manifest_data.get("path"))

            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)

            # Copy the file to the new location. A copy is used instead of a rename
            # to avoid any potential issues with the file being used in another
            # process. This avoids any potential race or concurrency conditions.
            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)

            # Adding this to the cache will allow us to re-use the already downloaded
            # manifest file for up to 1 hour.
            self.syn.cache.add(
                file_handle_id=manifest_data.dataFileHandleId,
                path=new_manifest_path_name,
                md5=manifest_data._file_handle.contentMd5,
            )

            # Update file names/paths in manifest_data
            manifest_data["name"] = new_manifest_filename
            manifest_data["filename"] = new_manifest_filename
            manifest_data["path"] = new_manifest_path_name

        return manifest_data
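
# Usage sketch (illustrative, not executed): downloading a dataset manifest
# directly with ManifestDownload. Assumes `syn` is an authenticated
# synapseclient.Synapse instance and "syn12345678" is a hypothetical manifest
# file ID.
#
#     md = ManifestDownload(syn=syn, manifest_id="syn12345678")
#     manifest_file = md.download_manifest(newManifestName="my_manifest")
#     print(manifest_file.path)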


class SynapseStorage(BaseStorage):
    """Implementation of Storage interface for datasets/files stored on Synapse.
    Provides utilities to list files in a specific project; update file annotations; create fileviews; etc.

    TODO: Need to define the interface and rename and/or refactor some of the methods below.
    """

    @tracer.start_as_current_span("SynapseStorage::__init__")
    def __init__(
        self,
        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
        access_token: Optional[str] = None,
        project_scope: Optional[list] = None,
        synapse_cache_path: Optional[str] = None,
        perform_query: Optional[bool] = True,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
    ) -> None:
        """Initializes a SynapseStorage object.

        Args:
            token (Optional[str], optional):
                Optional token parameter as found in a browser cookie upon login to Synapse.
                Defaults to None.
            access_token (Optional[str], optional):
                Optional access token (personal or OAuth).
                Defaults to None.
            project_scope (Optional[list], optional): Defaults to None.
            synapse_cache_path (Optional[str], optional):
                Location of the Synapse cache.
                Defaults to None.
        TODO:
            Consider the necessity of adding the "columns" and "where_clauses" params to the
            constructor. Currently, with how `query_fileview` is implemented, these params are
            not needed at this step, but they could be useful in the future if the need for
            more scoped queries expands.
        """
        self.syn = self.login(synapse_cache_path, access_token)
        current_span = trace.get_current_span()
        if current_span.is_recording():
            current_span.set_attribute("user.id", self.syn.credentials.owner_id)
        self.project_scope = project_scope
        self.storageFileview = CONFIG.synapse_master_fileview_id
        self.manifest = CONFIG.synapse_manifest_basename
        self.root_synapse_cache = self.syn.cache.cache_root_dir
        self.synapse_entity_tracker = SynapseEntityTracker()
        if perform_query:
            self.query_fileview(columns=columns, where_clauses=where_clauses)
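
    # Usage sketch (illustrative, not executed): creating a storage object that
    # is scoped to a single project. Assumes SYNAPSE_ACCESS_TOKEN is set in the
    # environment and "syn11111111" is a hypothetical project ID.
    #
    #     store = SynapseStorage(project_scope=["syn11111111"])
    #     fileview_df = store.getStorageFileviewTable()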

    # TODO: When moving this over to a regular cron-job the following logic should be
    # moved out of `manifest_download`:
    # if "SECRETS_MANAGER_SECRETS" in os.environ:
    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
    def _purge_synapse_cache(
        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
    ) -> None:
        """
        Purge the Synapse cache if it exceeds a certain size. Defaults to 1 GB.
        Args:
            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
                before purging the cache. Default is 1 GB.
            minute_buffer (int): All files created this amount of time or older will be deleted.
        """
        # try clearing the cache
        # scan a directory and check size of files
        if os.path.exists(self.root_synapse_cache):
            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
                1024**3
            )
            nbytes = get_dir_size(self.root_synapse_cache)
            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
            # if 1 GB has already been taken, purge files older than 15 min from the cache
            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
                num_of_deleted_files = clear_synapse_cache(
                    self.syn.cache, minutes=minute_buffer
                )
                logger.info(
                    f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
                )
            else:
                # on AWS, the OS takes around 14-17% of our ephemeral storage (20 GiB);
                # instead of guessing how much space we have left, print out the size of .synapseCache here
                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")

    @tracer.start_as_current_span("SynapseStorage::query_fileview")
    def query_fileview(
        self,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
        force_requery: Optional[bool] = False,
    ) -> None:
        """
        Method to query the Synapse FileView and store the results in a pandas DataFrame.
        The results are stored in the storageFileviewTable attribute.
        Called once during initialization of the SynapseStorage object; can be called again
        later to specify a more limited scope for validation purposes.
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
        """
        self._purge_synapse_cache()

        # Initialize to assume that the new fileview query will be different from what
        # may already be stored. Initializes to True because generally a query will not
        # already have been performed.
        self.new_query_different = True

        # If a query has already been performed, store the query
        previous_query_built = hasattr(self, "fileview_query")
        if previous_query_built:
            previous_query = self.fileview_query

        # Build a query with the current given parameters and check to see if it is different from the previous one
        self._build_query(columns=columns, where_clauses=where_clauses)
        if previous_query_built:
            self.new_query_different = self.fileview_query != previous_query

        # Only perform the query if it is different from the previous query, or if we are forcing new results to be retrieved
        if self.new_query_different or force_requery:
            try:
                self.storageFileviewTable = self.syn.tableQuery(
                    query=self.fileview_query,
                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
            except SynapseHTTPError as exc:
                exception_text = str(exc)
                if "Unknown column path" in exception_text:
                    raise ValueError(
                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
                    )
                elif "Unknown column" in exception_text:
                    missing_column = exception_text.split("Unknown column ")[-1]
                    raise ValueError(
                        f"The column(s) {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
                    )
                else:
                    raise AccessCredentialsError(self.storageFileview)
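
    # Usage sketch (illustrative, not executed): re-querying the fileview with a
    # narrower scope. The column names below appear in the fileview schema used
    # throughout this class; the dataset ID is hypothetical.
    #
    #     store.query_fileview(
    #         columns=["id", "path"],
    #         where_clauses=["parentId='syn22222222'", "type='file'"],
    #         force_requery=True,
    #     )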

    @staticmethod
    def build_clause_from_dataset_id(
        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
    ) -> str:
        """
        Method to build a where clause for a Synapse FileView query based on a dataset ID; can be used before an object is initialized.
        Args:
            dataset_id: Synapse ID of a dataset that should be used to limit the query
            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
        Returns:
            clause for the query, or an empty string if no dataset ID is provided
        """
        # Calling this method without specifying synIDs will complete but will not scope the view
        if (not dataset_id) and (not dataset_folder_list):
            return ""

        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
        if dataset_folder_list:
            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
            return f"parentId IN ({search_folders})"

        # `dataset_id` should be provided when all files are stored directly under the dataset folder
        return f"parentId='{dataset_id}'"

    def _build_query(
        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
    ):
        """
        Method to build a query for Synapse FileViews
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            self.storageFileview (str): Synapse FileView ID
            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
                Gets added to where_clauses; mostly included for backwards compatibility and as a more user-friendly way of subsetting the view in a simple way.
        """
        if columns is None:
            columns = []
        if where_clauses is None:
            where_clauses = []

        if self.project_scope:
            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
            where_clauses.append(project_scope_clause)

        if where_clauses:
            where_clauses = " AND ".join(where_clauses)
            where_clauses = f"WHERE {where_clauses} ;"
        else:
            where_clauses = ";"

        if columns:
            columns = ",".join(columns)
        else:
            columns = "*"

        self.fileview_query = (
            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
        )

        return
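
    # Usage sketch (illustrative): because this is a pure static method, the
    # expected outputs can be shown directly. The Synapse IDs are hypothetical.
    #
    #     SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
    #     # -> "parentId='syn123'"
    #     SynapseStorage.build_clause_from_dataset_id(
    #         dataset_folder_list=["syn123", "syn456"]
    #     )
    #     # -> "parentId IN ('syn123', 'syn456')"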
580 """ 581 all_results = self.syn.restGET( 582 "/projects/user/{principalId}".format(principalId=currentUserId) 583 ) 584 585 while ( 586 "nextPageToken" in all_results 587 ): # iterate over next page token in results while there is any 588 results_token = self.syn.restGET( 589 "/projects/user/{principalId}?nextPageToken={nextPageToken}".format( 590 principalId=currentUserId, 591 nextPageToken=all_results["nextPageToken"], 592 ) 593 ) 594 all_results["results"].extend(results_token["results"]) 595 596 if "nextPageToken" in results_token: 597 all_results["nextPageToken"] = results_token["nextPageToken"] 598 else: 599 del all_results["nextPageToken"] 600 601 return all_results 602 603 @tracer.start_as_current_span("SynapseStorage::getStorageProjects") 604 def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]: 605 """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute. 606 607 Returns: 608 A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName). 609 """ 610 611 # get the set of all storage Synapse project accessible for this pipeline 612 storageProjects = self.storageFileviewTable["projectId"].unique() 613 614 # get the set of storage Synapse project accessible for this user 615 # get a list of projects from Synapse 616 current_user_project_headers = self.synapse_entity_tracker.get_project_headers( 617 current_user_id=self.syn.credentials.owner_id, syn=self.syn 618 ) 619 project_id_to_name_dict = {} 620 current_user_projects = [] 621 for project_header in current_user_project_headers: 622 project_id_to_name_dict[project_header.get("id")] = project_header.get( 623 "name" 624 ) 625 current_user_projects.append(project_header.get("id")) 626 627 # find set of user projects that are also in this pipeline's storage projects set 628 storageProjects = list(set(storageProjects) & set(current_user_projects)) 629 630 # Limit projects to scope if specified 631 if project_scope: 632 storageProjects = list(set(storageProjects) & set(project_scope)) 633 634 if not storageProjects: 635 raise Warning( 636 f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}" 637 ) 638 639 # prepare a return list of project IDs and names 640 projects = [] 641 for projectId in storageProjects: 642 project_name_from_project_header = project_id_to_name_dict.get(projectId) 643 projects.append((projectId, project_name_from_project_header)) 644 645 sorted_projects_list = sorted(projects, key=lambda tup: tup[0]) 646 647 return sorted_projects_list 648 649 @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject") 650 def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]: 651 """Gets all datasets in folder under a given storage project that the current user has access to. 652 653 Args: 654 projectId: synapse ID of a storage project. 655 656 Returns: 657 A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). 658 None: If the projectId cannot be found on Synapse. 
659 """ 660 661 # select all folders and fetch their names from within the storage project; 662 # if folder content type is defined, only select folders that contain datasets 663 if "contentType" in self.storageFileviewTable.columns: 664 foldersTable = self.storageFileviewTable[ 665 (self.storageFileviewTable["contentType"] == "dataset") 666 & (self.storageFileviewTable["projectId"] == projectId) 667 ] 668 else: 669 foldersTable = self.storageFileviewTable[ 670 (self.storageFileviewTable["type"] == "folder") 671 & (self.storageFileviewTable["parentId"] == projectId) 672 ] 673 674 # get an array of tuples (folderId, folderName) 675 # some folders are part of datasets; others contain datasets 676 # each dataset parent is the project; folders part of a dataset have another folder as a parent 677 # to get folders if and only if they contain datasets for each folder 678 # check if folder's parent is the project; if so that folder contains a dataset, 679 # unless the folder list has already been filtered to dataset folders based on contentType attribute above 680 681 datasetList = [] 682 folderProperties = ["id", "name"] 683 for folder in list( 684 foldersTable[folderProperties].itertuples(index=False, name=None) 685 ): 686 datasetList.append(folder) 687 688 sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0]) 689 690 return sorted_dataset_list 691 692 @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset") 693 def getFilesInStorageDataset( 694 self, datasetId: str, fileNames: List = None, fullpath: bool = True 695 ) -> List[Tuple[str, str]]: 696 """Gets all files (excluding manifest files) in a given dataset folder. 697 698 Args: 699 datasetId: synapse ID of a storage dataset. 700 fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. 701 metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present. 702 fullpath: if True return the full path as part of this filename; otherwise return just base filename 703 704 Returns: 705 A list of files; the list consists of tuples (fileId, fileName). 706 707 Raises: 708 ValueError: Dataset ID not found. 709 """ 710 file_list = [] 711 712 # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view 713 if self.storageFileviewTable.empty: 714 raise ValueError( 715 f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again." 716 ) 717 718 child_path = self.storageFileviewTable.loc[ 719 self.storageFileviewTable["parentId"] == datasetId, "path" 720 ] 721 if child_path.empty: 722 raise LookupError( 723 f"Dataset {datasetId} could not be found in fileview {self.storageFileview}." 

    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

        Returns:
            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
        """

        # get the set of all storage Synapse projects accessible for this pipeline
        storageProjects = self.storageFileviewTable["projectId"].unique()

        # get the set of storage Synapse projects accessible for this user
        # get a list of projects from Synapse
        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
            current_user_id=self.syn.credentials.owner_id, syn=self.syn
        )
        project_id_to_name_dict = {}
        current_user_projects = []
        for project_header in current_user_project_headers:
            project_id_to_name_dict[project_header.get("id")] = project_header.get(
                "name"
            )
            current_user_projects.append(project_header.get("id"))

        # find the set of user projects that are also in this pipeline's storage projects set
        storageProjects = list(set(storageProjects) & set(current_user_projects))

        # Limit projects to scope if specified
        if project_scope:
            storageProjects = list(set(storageProjects) & set(project_scope))

        if not storageProjects:
            raise Warning(
                f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
            )

        # prepare a return list of project IDs and names
        projects = []
        for projectId in storageProjects:
            project_name_from_project_header = project_id_to_name_dict.get(projectId)
            projects.append((projectId, project_name_from_project_header))

        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])

        return sorted_projects_list

    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
        """Gets all datasets in folders under a given storage project that the current user has access to.

        Args:
            projectId: Synapse ID of a storage project.

        Returns:
            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
            None: If the projectId cannot be found on Synapse.
        """

        # select all folders and fetch their names from within the storage project;
        # if folder content type is defined, only select folders that contain datasets
        if "contentType" in self.storageFileviewTable.columns:
            foldersTable = self.storageFileviewTable[
                (self.storageFileviewTable["contentType"] == "dataset")
                & (self.storageFileviewTable["projectId"] == projectId)
            ]
        else:
            foldersTable = self.storageFileviewTable[
                (self.storageFileviewTable["type"] == "folder")
                & (self.storageFileviewTable["parentId"] == projectId)
            ]

        # get an array of tuples (folderId, folderName)
        # some folders are part of datasets; others contain datasets
        # each dataset's parent is the project; folders that are part of a dataset have another folder as a parent
        # to get folders if and only if they contain datasets, for each folder
        # check if the folder's parent is the project; if so, that folder contains a dataset,
        # unless the folder list has already been filtered to dataset folders based on the contentType attribute above

        datasetList = []
        folderProperties = ["id", "name"]
        for folder in list(
            foldersTable[folderProperties].itertuples(index=False, name=None)
        ):
            datasetList.append(folder)

        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])

        return sorted_dataset_list
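
    # Usage sketch (illustrative, not executed): walking every dataset the user
    # can see. Both methods return sorted lists of (synId, name) tuples.
    #
    #     for project_id, project_name in store.getStorageProjects():
    #         for dataset_id, dataset_name in store.getStorageDatasetsInProject(project_id):
    #             print(project_name, dataset_name, dataset_id)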
813 "" (String): No pre-exisiting manifest in dataset. 814 """ 815 manifest_data = "" 816 817 # get a list of files containing the manifest for this dataset (if any) 818 all_files = self.storageFileviewTable 819 820 # construct regex based on manifest basename in the config 821 manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv") 822 823 # search manifest based on given manifest basename regex above 824 # and return a dataframe containing name and id of manifests in a given asset view 825 manifest = all_files[ 826 (all_files["name"].str.contains(manifest_re, regex=True)) 827 & (all_files["parentId"] == datasetId) 828 ] 829 830 manifest = manifest[["id", "name"]] 831 832 # if there is no pre-exisiting manifest in the specified dataset 833 if manifest.empty: 834 logger.warning( 835 f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}" 836 ) 837 return "" 838 839 # if there is an exisiting manifest 840 else: 841 manifest_syn_id = self._get_manifest_id(manifest) 842 if downloadFile: 843 md = ManifestDownload( 844 self.syn, 845 manifest_id=manifest_syn_id, 846 synapse_entity_tracker=self.synapse_entity_tracker, 847 ) 848 manifest_data = md.download_manifest( 849 newManifestName=newManifestName, 850 manifest_df=manifest, 851 use_temporary_folder=use_temporary_folder, 852 ) 853 # TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string, 854 # then we should catch the error here without returning an empty string. 855 if not manifest_data: 856 logger.debug( 857 f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}" 858 ) 859 return manifest_data 860 return manifest_syn_id 861 862 def getDataTypeFromManifest(self, manifestId: str): 863 """Fetch a manifest and return data types of all columns 864 Args: 865 manifestId: synapse ID of a manifest 866 """ 867 # get manifest file path 868 manifest_entity = self.synapse_entity_tracker.get( 869 synapse_id=manifestId, syn=self.syn, download_file=True 870 ) 871 manifest_filepath = manifest_entity.path 872 873 # load manifest dataframe 874 manifest = load_df( 875 manifest_filepath, 876 preserve_raw_input=False, 877 data_model=False, 878 ) 879 880 # convert the dataFrame to use best possible dtypes. 881 manifest_new = manifest.convert_dtypes() 882 883 # get data types of columns 884 result = manifest_new.dtypes.to_frame("dtypes").reset_index() 885 886 # return the result as a dictionary 887 result_dict = result.set_index("index")["dtypes"].astype(str).to_dict() 888 889 return result_dict 890 891 def _get_files_metadata_from_dataset( 892 self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None 893 ) -> Optional[dict]: 894 """retrieve file ids under a particular datasetId 895 896 Args: 897 datasetId (str): a dataset id 898 only_new_files (bool): if only adding new files that are not already exist 899 manifest (pd.DataFrame): metadata manifest dataframe. Default to None. 

    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
    def getDatasetManifest(
        self,
        datasetId: str,
        downloadFile: bool = False,
        newManifestName: str = "",
        use_temporary_folder: bool = True,
    ) -> Union[str, File]:
        """Gets the manifest associated with a given dataset.

        Args:
            datasetId: Synapse ID of a storage dataset.
            downloadFile: boolean argument indicating if the manifest file in the dataset should be downloaded or not.
            newManifestName: new name of a manifest that gets downloaded
            use_temporary_folder: boolean argument indicating if a temporary folder
                should be used to store the manifest file. This is useful when running
                this code as an API server where multiple requests could be made at the
                same time. This is set to False when the code is being used from the
                CLI. Defaults to True.

        Returns:
            manifest_syn_id (String): Synapse ID of the existing manifest file.
            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
            "" (String): No pre-existing manifest in the dataset.
        """
        manifest_data = ""

        # get a list of files containing the manifest for this dataset (if any)
        all_files = self.storageFileviewTable

        # construct a regex based on the manifest basename in the config
        manifest_re = re.compile(os.path.basename(self.manifest) + r".*\.[tc]sv")

        # search for manifests based on the given manifest basename regex above
        # and return a dataframe containing the name and id of manifests in a given asset view
        manifest = all_files[
            (all_files["name"].str.contains(manifest_re, regex=True))
            & (all_files["parentId"] == datasetId)
        ]

        manifest = manifest[["id", "name"]]

        # if there is no pre-existing manifest in the specified dataset
        if manifest.empty:
            logger.warning(
                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
            )
            return ""

        # if there is an existing manifest
        else:
            manifest_syn_id = self._get_manifest_id(manifest)
            if downloadFile:
                md = ManifestDownload(
                    self.syn,
                    manifest_id=manifest_syn_id,
                    synapse_entity_tracker=self.synapse_entity_tracker,
                )
                manifest_data = md.download_manifest(
                    newManifestName=newManifestName,
                    manifest_df=manifest,
                    use_temporary_folder=use_temporary_folder,
                )
                # TODO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
                # then we should catch the error here without returning an empty string.
                if not manifest_data:
                    logger.debug(
                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
                    )
                return manifest_data
            return manifest_syn_id

    def getDataTypeFromManifest(self, manifestId: str):
        """Fetch a manifest and return the data types of all columns
        Args:
            manifestId: Synapse ID of a manifest
        """
        # get manifest file path
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifestId, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path

        # load manifest dataframe
        manifest = load_df(
            manifest_filepath,
            preserve_raw_input=False,
            data_model=False,
        )

        # convert the dataframe to use the best possible dtypes
        manifest_new = manifest.convert_dtypes()

        # get data types of columns
        result = manifest_new.dtypes.to_frame("dtypes").reset_index()

        # return the result as a dictionary
        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()

        return result_dict

    def _get_files_metadata_from_dataset(
        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
    ) -> Optional[dict]:
        """Retrieve file names and entity ids under a particular datasetId

        Args:
            datasetId (str): a dataset id
            only_new_files (bool): whether to only add new files that do not already exist
            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.

        Returns:
            a dictionary that contains filenames and entity ids under a given datasetId, or None if there is nothing under the given dataset id
        """
        dataset_files = self.getFilesInStorageDataset(datasetId)
        if dataset_files:
            dataset_file_names_id_dict = self._get_file_entityIds(
                dataset_files, only_new_files=only_new_files, manifest=manifest
            )
            return dataset_file_names_id_dict
        else:
            return None
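
    # Usage sketch (illustrative, not executed): fetching just the manifest ID
    # versus downloading the manifest file itself. The dataset ID is hypothetical.
    #
    #     manifest_id = store.getDatasetManifest("syn22222222")
    #     manifest_file = store.getDatasetManifest("syn22222222", downloadFile=True)
    #     if manifest_file:
    #         print(manifest_file.path)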

    def add_entity_id_and_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> pd.DataFrame:
        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe; assumes this dataframe does not have an entityId column, and that the Filename column is present but completely empty

        Returns:
            pd.DataFrame: the updated manifest dataframe
        """
        # get file names and entity ids of a given dataset
        dataset_files_dict = self._get_files_metadata_from_dataset(
            datasetId, only_new_files=False
        )

        if dataset_files_dict:
            # turn the manifest dataframe back into a dictionary for manipulation
            manifest_dict = manifest.to_dict("list")

            # update the Filename column
            # add the entityId column to the end
            manifest_dict.update(dataset_files_dict)

            # if the Component column exists in the existing manifest, fill up that column
            if "Component" in manifest_dict.keys():
                manifest_dict["Component"] = manifest_dict["Component"] * max(
                    1, len(manifest_dict["Filename"])
                )

            # turn the dictionary back into a dataframe
            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
            manifest_df_updated = manifest_df_index.transpose()

            # fill NA with empty string
            manifest_df_updated = manifest_df_updated.fillna("")

            # drop index
            manifest_df_updated = manifest_df_updated.reset_index(drop=True)

            return manifest_df_updated
        else:
            return manifest

    def fill_in_entity_id_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> Tuple[List, pd.DataFrame]:
        """Fill in the Filename and entityId columns. The entityId and Filename columns will be created if not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe.

        Returns:
            Tuple[List, pd.DataFrame]: a list of synIds that are under the given datasetId folder, and the updated manifest dataframe
        """
        # get dataset file names and entity ids as a list of tuples
        dataset_files = self.getFilesInStorageDataset(datasetId)

        # update manifest with additional filenames, if any
        # note that if there is an existing manifest and there are files in the dataset,
        # the columns Filename and entityId are assumed to be present in the manifest schema
        # TODO: use idiomatic pandas syntax
        if not dataset_files:
            manifest = manifest.fillna("")
            return dataset_files, manifest

        all_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=False, manifest=manifest
        )
        new_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=True, manifest=manifest
        )

        all_files = pd.DataFrame(all_files)
        new_files = pd.DataFrame(new_files)

        # update the manifest so that it contains the new dataset files
        manifest = (
            pd.concat([manifest, new_files], sort=False)
            .reset_index()
            .drop("index", axis=1)
        )

        # Reindex the manifest and new-files dataframes according to entityIds to align file paths and metadata
        manifest_reindex = manifest.set_index("entityId")
        all_files_reindex = all_files.set_index("entityId")
        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
            manifest_reindex
        )

        # Check if individual file paths in the manifest and from Synapse match
        file_paths_match = (
            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
        )

        # If any of the paths do not match, update the manifest with the file paths from Synapse
        if not file_paths_match.all():
            manifest_reindex.loc[
                ~file_paths_match, "Filename"
            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

        # reformat the manifest for further use
        manifest = manifest_reindex.reset_index()
        entityIdCol = manifest.pop("entityId")
        manifest.insert(len(manifest.columns), "entityId", entityIdCol)

        manifest = manifest.fillna("")
        return dataset_files, manifest
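
    # Usage sketch (illustrative, not executed): syncing a manifest dataframe
    # with the files currently in a hypothetical dataset folder.
    #
    #     dataset_files, manifest = store.fill_in_entity_id_filename(
    #         "syn22222222", manifest
    #     )
    #     # `manifest` now has one row per file, with Filename values taken from
    #     # Synapse and entityId moved to the last column.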

    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
    def updateDatasetManifestFiles(
        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
    ) -> Union[Tuple[str, pd.DataFrame], None]:
        """Fetch the names and entity IDs of all current files in the dataset in the store, if any; update the dataset's manifest with new files, if any.

        Args:
            dmge: DataModelGraphExplorer instance
            datasetId: Synapse ID of a storage dataset.
            store: if set to True, store the updated manifest in the asset store; if set to False,
                return a pandas dataframe containing the updated manifest but do not store it in the asset store

        Returns:
            Synapse ID of the updated manifest and a pandas dataframe containing the updated manifest.
            If there is no existing manifest, or if the manifest does not have an entityId column, returns None.
        """

        # get the existing manifest Synapse ID
        manifest_id = self.getDatasetManifest(datasetId)

        # if there is no manifest, return None
        if not manifest_id:
            return None

        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_id, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path
        manifest = load_df(manifest_filepath)

        # If the manifest does not have an entityId column, trigger a new manifest to be generated
        if "entityId" not in manifest.columns:
            return None

        manifest_is_file_based = "Filename" in manifest.columns

        if manifest_is_file_based:
            # update the manifest with additional filenames, if any
            # note that if there is an existing manifest and there are files in the dataset,
            # the columns Filename and entityId are assumed to be present in the manifest schema
            # TODO: use idiomatic pandas syntax
            dataset_files, manifest = self.fill_in_entity_id_filename(
                datasetId, manifest
            )
            if dataset_files:
                # update the manifest file, so that it contains the relevant entity IDs
                if store:
                    manifest.to_csv(manifest_filepath, index=False)

                    # store the manifest and update the associated metadata with the manifest on Synapse
                    manifest_id = self.associateMetadataWithFiles(
                        dmge, manifest_filepath, datasetId
                    )

        return manifest_id, manifest

    def _get_file_entityIds(
        self,
        dataset_files: List,
        only_new_files: bool = False,
        manifest: pd.DataFrame = None,
    ):
        """
        Get a dictionary of files in a dataset: either files that are not in the current manifest, or all files

        Args:
            manifest: metadata manifest
            dataset_files: List of all files in a dataset
            only_new_files: boolean to control whether only new files are returned, or all files in the dataset
        Returns:
            files: dictionary of file names and entityIds, with scope as specified by `only_new_files`
        """
        files = {"Filename": [], "entityId": []}

        if only_new_files:
            if manifest is None:
                raise UnboundLocalError(
                    "No manifest was passed in; a manifest is required when `only_new_files` is True."
                )

            if "entityId" not in manifest.columns:
                raise ValueError(
                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
                    "Please generate an empty manifest without annotations, manually add annotations to the "
                    "appropriate files in the manifest, and then try again."
                )

            # find new files (that are not in the current manifest), if any
            for file_id, file_name in dataset_files:
                if file_id not in manifest["entityId"].values:
                    files["Filename"].append(file_name)
                    files["entityId"].append(file_id)
        else:
            # get all files
            for file_id, file_name in dataset_files:
                files["Filename"].append(file_name)
                files["entityId"].append(file_id)

        return files
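
    # Usage sketch (illustrative, not executed): refreshing a dataset manifest
    # without writing anything back to Synapse. `dmge` is assumed to be a
    # DataModelGraphExplorer built elsewhere; the dataset ID is hypothetical.
    #
    #     result = store.updateDatasetManifestFiles(dmge, "syn22222222", store=False)
    #     if result is not None:
    #         manifest_id, manifest_df = result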

    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
    def getProjectManifests(
        self, projectId: str
    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
        """Gets all metadata manifest files across all datasets in a specified project.

        Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest,
            as a list of tuples, one for each manifest:
                [
                    (
                        (datasetId, dataName),
                        (manifestId, manifestName),
                        (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
                    ),
                    ...
                ]

        TODO: Return a manifest URI instead of a Synapse ID for interoperability with other implementations of a store interface
        """
        component = None
        entity = None
        manifests = []

        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            # Get the synID of the manifest for a dataset
            manifestId = self.getDatasetManifest(datasetId)

            # If a manifest exists, get the annotations for it; else return the base 'manifest' tuple
            if manifestId:
                annotations = self.getFileAnnotations(manifestId)

                # If the manifest has annotations specifying the component, use that
                if annotations and "Component" in annotations:
                    component = annotations["Component"]
                    entity = self.synapse_entity_tracker.get(
                        synapse_id=manifestId, syn=self.syn, download_file=False
                    )
                    manifest_name = entity["properties"]["name"]

                # otherwise download the manifest and parse it for the information
                elif not annotations or "Component" not in annotations:
                    logging.debug(
                        f"No component annotations have been found for manifest {manifestId}. "
                        "The manifest will be downloaded and parsed instead. "
                        "For increased speed, add component annotations to manifest."
                    )

                    manifest_info = self.getDatasetManifest(
                        datasetId, downloadFile=True
                    )
                    manifest_name = manifest_info["properties"].get("name", "")

                    if not manifest_name:
                        logger.error(f"Failed to download manifests from {datasetId}")

                    manifest_path = manifest_info["path"]

                    manifest_df = load_df(manifest_path)

                    # Get the component from the Component column if it exists
                    if (
                        "Component" in manifest_df
                        and not manifest_df["Component"].empty
                    ):
                        component = list(set(manifest_df["Component"]))

                        # Added to address issues raised during DCA testing
                        if "" in component:
                            component.remove("")

                        if len(component) == 1:
                            component = component[0]
                        elif len(component) > 1:
                            logging.warning(
                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
                                "Behavior of manifests with multiple components is undefined."
                            )
            else:
                manifest_name = ""
                component = None
            if component:
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    (component, component),
                )
            elif manifestId:
                logging.debug(
                    f"Manifest {manifestId} does not have an associated Component"
                )
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    ("", ""),
                )
            else:
                manifest = (
                    (datasetId, datasetName),
                    ("", ""),
                    ("", ""),
                )

            if manifest:
                manifests.append(manifest)

        return manifests
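
    # Usage sketch (illustrative, not executed): unpacking the nested-tuple
    # return shape documented above. The project ID is hypothetical.
    #
    #     for dataset, manifest, component in store.getProjectManifests("syn11111111"):
    #         dataset_id, dataset_name = dataset
    #         manifest_id, manifest_name = manifest
    #         component_label, _ = component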
1206 "Behavior of manifests with multiple components is undefined" 1207 ) 1208 else: 1209 manifest_name = "" 1210 component = None 1211 if component: 1212 manifest = ( 1213 (datasetId, datasetName), 1214 (manifestId, manifest_name), 1215 (component, component), 1216 ) 1217 elif manifestId: 1218 logging.debug( 1219 f"Manifest {manifestId} does not have an associated Component" 1220 ) 1221 manifest = ( 1222 (datasetId, datasetName), 1223 (manifestId, manifest_name), 1224 ("", ""), 1225 ) 1226 else: 1227 manifest = ( 1228 (datasetId, datasetName), 1229 ("", ""), 1230 ("", ""), 1231 ) 1232 1233 if manifest: 1234 manifests.append(manifest) 1235 1236 return manifests 1237 1238 def upload_project_manifests_to_synapse( 1239 self, dmge: DataModelGraphExplorer, projectId: str 1240 ) -> List[str]: 1241 """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse. 1242 1243 Returns: String of all the manifest_table_ids of all the manifests that have been loaded. 1244 """ 1245 1246 manifests = [] 1247 manifest_loaded = [] 1248 datasets = self.getStorageDatasetsInProject(projectId) 1249 1250 for datasetId, datasetName in datasets: 1251 # encode information about the manifest in a simple list (so that R clients can unpack it) 1252 # eventually can serialize differently 1253 1254 manifest = ((datasetId, datasetName), ("", ""), ("", "")) 1255 1256 manifest_info = self.getDatasetManifest(datasetId, downloadFile=True) 1257 if manifest_info: 1258 manifest_id = manifest_info["properties"]["id"] 1259 manifest_name = manifest_info["properties"]["name"] 1260 manifest_path = manifest_info["path"] 1261 manifest_df = load_df(manifest_path) 1262 manifest_table_id = uploadDB( 1263 dmge=dmge, 1264 manifest=manifest, 1265 datasetId=datasetId, 1266 table_name=datasetName, 1267 ) 1268 manifest_loaded.append(datasetName) 1269 return manifest_loaded 1270 1271 def upload_annotated_project_manifests_to_synapse( 1272 self, projectId: str, path_to_json_ld: str, dry_run: bool = False 1273 ) -> List[str]: 1274 """ 1275 Purpose: 1276 For all manifests in a project, upload them as a table and add annotations manifest csv. 1277 Assumes the manifest is already present as a CSV in a dataset in the project. 

    def move_entities_to_new_project(
        self,
        projectId: str,
        newProjectId: str,
        returnEntities: bool = False,
        dry_run: bool = False,
    ):
        """
        For each manifest csv in a project, look up all the entity ids that are associated with it.
        Look up each entity in the files and move the entity to the new project.
        """

        manifests = []
        manifest_loaded = []
        datasets = self.getStorageDatasetsInProject(projectId)
        if datasets:
            for datasetId, datasetName in datasets:
                # encode information about the manifest in a simple list (so that R clients can unpack it)
                # eventually can serialize differently

                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
                manifests.append(manifest)

                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
                if manifest_info:
                    manifest_id = manifest_info["properties"]["id"]
                    manifest_name = manifest_info["properties"]["name"]
                    manifest_path = manifest_info["path"]
                    manifest_df = load_df(manifest_path)

                    manifest = (
                        (datasetId, datasetName),
                        (manifest_id, manifest_name),
                        ("", ""),
                    )
                    manifest_loaded.append(manifest)

                    annotation_entities = self.storageFileviewTable[
                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
                        & (self.storageFileviewTable["type"] == "folder")
                    ]["id"]

                    if returnEntities:
                        for entityId in annotation_entities:
                            if not dry_run:
                                moved_entity = self.syn.move(entityId, datasetId)
                                self.synapse_entity_tracker.add(
                                    synapse_id=moved_entity.id, entity=moved_entity
                                )
                            else:
                                logging.info(
                                    f"{entityId} will be moved to folder {datasetId}."
                                )
                    else:
                        # generate the project archive folder
                        archive_project_folder = Folder(
                            projectId + "_archive", parent=newProjectId
                        )
                        archive_project_folder = self.syn.store(archive_project_folder)
                        self.synapse_entity_tracker.add(
                            synapse_id=archive_project_folder.id,
                            entity=archive_project_folder,
                        )

                        # generate the dataset archive folder
                        dataset_archive_folder = Folder(
                            "_".join([datasetId, datasetName, "archive"]),
                            parent=archive_project_folder.id,
                        )
                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
                        self.synapse_entity_tracker.add(
                            synapse_id=dataset_archive_folder.id,
                            entity=dataset_archive_folder,
                        )

                        for entityId in annotation_entities:
                            # move entities to the archive folder
                            if not dry_run:
                                moved_entity = self.syn.move(
                                    entityId, dataset_archive_folder.id
                                )
                                self.synapse_entity_tracker.add(
                                    synapse_id=moved_entity.id, entity=moved_entity
                                )
                            else:
                                logging.info(
                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
                                )
        else:
            raise LookupError(
                f"No datasets were found in the specified project: {projectId}. Re-check the specified master_fileview in CONFIG and retry."
            )
        return manifests, manifest_loaded
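
    # Usage sketch (illustrative, not executed): previewing an archive move with
    # dry_run before performing it. Both project IDs are hypothetical.
    #
    #     store.move_entities_to_new_project(
    #         projectId="syn11111111",
    #         newProjectId="syn99999999",
    #         dry_run=True,  # only logs what would be moved
    #     )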

    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
        """Download a Synapse table as a pandas dataframe; also return the table schema and etags in the results

        Args:
            synapse_id: Synapse ID of the table to query
        """

        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
        df = results.asDataFrame(
            rowIdAndVersionInIndex=False,
            na_values=STR_NA_VALUES_FILTERED,
            keep_default_na=False,
        )

        return df, results

    @missing_entity_handler
    @tracer.start_as_current_span("SynapseStorage::uploadDB")
    def uploadDB(
        self,
        dmge: DataModelGraphExplorer,
        manifest: pd.DataFrame,
        datasetId: str,
        table_name: str,
        restrict: bool = False,
        table_manipulation: str = "replace",
        table_column_names: str = "class_label",
    ):
        """
        Method to upload a database to an asset store. In Synapse, this will upload a metadata table.

        Args:
            dmge: DataModelGraphExplorer object
            manifest: pd.DataFrame manifest to upload
            datasetId: synID of the dataset for the manifest
            table_name: name of the table to be uploaded
            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
            table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing metadata (replace) or be added to it (upsert)
            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting.
        Returns:
            manifest_table_id: synID of the uploaded table
            manifest: the original manifest
            table_manifest: manifest formatted appropriately for the table
        """

        col_schema, table_manifest = self.formatDB(
            dmge=dmge, manifest=manifest, table_column_names=table_column_names
        )

        manifest_table_id = self.buildDB(
            datasetId,
            table_name,
            col_schema,
            table_manifest,
            table_manipulation,
            dmge,
            restrict,
        )

        return manifest_table_id, manifest, table_manifest
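
    # Usage sketch (illustrative, not executed): uploading a manifest dataframe
    # as a Synapse table, upserting into an existing table if one is found.
    # `dmge`, `manifest_df`, and the dataset ID are assumed to exist.
    #
    #     table_id, manifest, table_manifest = store.uploadDB(
    #         dmge=dmge,
    #         manifest=manifest_df,
    #         datasetId="syn22222222",
    #         table_name="my_dataset_table",
    #         table_manipulation="upsert",
    #     )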

    @tracer.start_as_current_span("SynapseStorage::formatDB")
    def formatDB(self, dmge, manifest, table_column_names):
        """
        Method to format a manifest appropriately for upload as a table

        Args:
            dmge: DataModelGraphExplorer object
            manifest: pd.DataFrame manifest to upload
            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
                display label formatting.
        Returns:
            col_schema: schema for the table columns: type, size, etc.
            table_manifest: formatted manifest
        """
        # Rename the manifest columns to display names to match the fileview

        blacklist_chars = ["(", ")", ".", " ", "-"]
        manifest_columns = manifest.columns.tolist()

        table_manifest = deepcopy(manifest)

        if table_column_names == "display_name":
            cols = table_manifest.columns

        elif table_column_names == "display_label":
            cols = [
                str(col).translate({ord(x): "" for x in blacklist_chars})
                for col in manifest_columns
            ]

        elif table_column_names == "class_label":
            cols = [
                get_class_label_from_display_name(str(col)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )
                for col in manifest_columns
            ]
        else:
            raise ValueError(
                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
            )

        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))

        # Reset column names in the table manifest
        table_manifest.columns = cols

        # move the entityId column to the end of the dataframe
        entity_col = table_manifest.pop("entityId")
        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)

        # Get the column schema
        col_schema = as_table_columns(table_manifest)

        # Set the Id column length to 64 (for some reason it is not being auto-set)
        for i, col in enumerate(col_schema):
            if col["name"].lower() == "id":
                col_schema[i]["maximumSize"] = 64

        return col_schema, table_manifest
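
    # Illustrative example of the three column-name styles above, for a
    # hypothetical display name "Patient ID (primary)" (the class_label output
    # is approximate, depending on get_class_label_from_display_name):
    #
    #     display_name  -> "Patient ID (primary)"   # used as-is
    #     display_label -> "PatientIDprimary"       # blacklisted chars stripped
    #     class_label   -> "PatientIDPrimary"       # upper camelcase, then stripped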
    @tracer.start_as_current_span("SynapseStorage::buildDB")
    def buildDB(
        self,
        datasetId: str,
        table_name: str,
        col_schema: List,
        table_manifest: pd.DataFrame,
        table_manipulation: str,
        dmge: DataModelGraphExplorer,
        restrict: bool = False,
    ):
        """
        Method to construct the table appropriately: create a new table, replace an existing one, or upsert new rows into an existing table.
        Calls the TableOperations class to execute.

        Args:
            datasetId: synID of the dataset for the manifest
            table_name: name of the table to be uploaded
            col_schema: schema for table columns: type, size, etc. from `formatDB`
            table_manifest: formatted manifest that can be uploaded as a table
            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions

        Returns:
            manifest_table_id: synID of the uploaded table

        """
        table_parent_id = self.getDatasetProject(datasetId=datasetId)
        existing_table_id = self.syn.findEntityId(
            name=table_name, parent=table_parent_id
        )

        tableOps = TableOperations(
            synStore=self,
            tableToLoad=table_manifest,
            tableName=table_name,
            datasetId=datasetId,
            existingTableId=existing_table_id,
            restrict=restrict,
            synapse_entity_tracker=self.synapse_entity_tracker,
        )

        if not table_manipulation or existing_table_id is None:
            manifest_table_id = tableOps.createTable(
                columnTypeDict=col_schema,
                specifySchema=True,
            )
        elif existing_table_id is not None:
            if table_manipulation.lower() == "replace":
                manifest_table_id = tableOps.replaceTable(
                    specifySchema=True,
                    columnTypeDict=col_schema,
                )
            elif table_manipulation.lower() == "upsert":
                manifest_table_id = tableOps.upsertTable(
                    dmge=dmge,
                )
            elif table_manipulation.lower() == "update":
                manifest_table_id = tableOps.updateTable()
            else:
                raise ValueError(
                    f"The provided table_manipulation: {table_manipulation} is not valid; allowed values are 'replace', 'upsert', and 'update'."
                )

        if table_manipulation and table_manipulation.lower() == "upsert":
            table_entity = self.synapse_entity_tracker.get(
                synapse_id=existing_table_id or manifest_table_id,
                syn=self.syn,
                download_file=False,
            )
            annos = OldAnnotations(
                id=table_entity.id,
                etag=table_entity.etag,
                values=table_entity.annotations,
            )
            annos["primary_key"] = table_manifest["Component"][0] + "_id"
            annos = self.syn.set_annotations(annos)
            table_entity.etag = annos.etag
            table_entity.annotations = annos

        return manifest_table_id
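
    # Summary of how `buildDB` routes on `table_manipulation` (this follows
    # the branches above):
    #
    #     no existing table (or falsy manipulation) -> createTable
    #     existing table + "replace"                -> replaceTable
    #     existing table + "upsert"                 -> upsertTable
    #     existing table + "update"                 -> updateTable
    #
    # For "upsert", a `primary_key` annotation of the form "<Component>_id"
    # is additionally stored on the table entity.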
    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
    def upload_manifest_file(
        self,
        manifest,
        metadataManifestPath,
        datasetId,
        restrict_manifest,
        component_name="",
    ):
        # Update manifest to have the new entityId column
        manifest.to_csv(metadataManifestPath, index=False)

        # store manifest to Synapse as a CSV
        # update file name
        file_name_full = metadataManifestPath.split("/")[-1]
        file_extension = file_name_full.split(".")[-1]

        # Differentiate "censored" and "uncensored" manifests
        if "censored" in file_name_full:
            file_name_new = (
                os.path.basename(CONFIG.synapse_manifest_basename)
                + "_"
                + component_name
                + "_censored"
                + "."
                + file_extension
            )
        else:
            file_name_new = (
                os.path.basename(CONFIG.synapse_manifest_basename)
                + "_"
                + component_name
                + "."
                + file_extension
            )

        manifest_synapse_file = None
        try:
            # Rename the file to file_name_new, then revert.
            # This is to maintain the original file name in case other code is
            # expecting the file to exist with the original name.
            original_file_path = metadataManifestPath
            new_file_path = os.path.join(
                os.path.dirname(metadataManifestPath), file_name_new
            )
            os.rename(original_file_path, new_file_path)

            manifest_synapse_file = self._store_file_for_manifest_upload(
                new_file_path=new_file_path,
                dataset_id=datasetId,
                existing_file_name=file_name_full,
                file_name_new=file_name_new,
                restrict_manifest=restrict_manifest,
            )
            manifest_synapse_file_id = manifest_synapse_file.id

        finally:
            # Revert the file name back to the original
            os.rename(new_file_path, original_file_path)

            if manifest_synapse_file:
                manifest_synapse_file.path = original_file_path

        return manifest_synapse_file_id

    def _store_file_for_manifest_upload(
        self,
        new_file_path: str,
        dataset_id: str,
        existing_file_name: str,
        file_name_new: str,
        restrict_manifest: bool,
    ) -> File:
        """Handles a create or update of a manifest file that is going to be uploaded.
        If we already have a copy of the entity in memory, we will update that instance;
        otherwise, create a new File instance to be created in Synapse. Once stored,
        this will add the file to the `synapse_entity_tracker` for future reference.

        Args:
            new_file_path (str): The path to the new manifest file
            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
            existing_file_name (str): The name of the existing file
            file_name_new (str): The name of the new file
            restrict_manifest (bool): Whether the manifest should be restricted

        Returns:
            File: The stored manifest file
        """
        local_tracked_file_instance = (
            self.synapse_entity_tracker.search_local_by_parent_and_name(
                name=existing_file_name, parent_id=dataset_id
            )
            or self.synapse_entity_tracker.search_local_by_parent_and_name(
                name=file_name_new, parent_id=dataset_id
            )
        )

        if local_tracked_file_instance:
            local_tracked_file_instance.path = new_file_path
            local_tracked_file_instance.description = (
                "Manifest for dataset " + dataset_id
            )
            manifest_synapse_file = local_tracked_file_instance
        else:
            manifest_synapse_file = File(
                path=new_file_path,
                description="Manifest for dataset " + dataset_id,
                parent=dataset_id,
                name=file_name_new,
            )

        manifest_synapse_file = self.syn.store(
            manifest_synapse_file, isRestricted=restrict_manifest
        )

        self.synapse_entity_tracker.add(
            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
        )
        return manifest_synapse_file

    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
        """Get annotations asynchronously.

        Args:
            synapse_id (str): Synapse ID of the entity that the annotations belong to

        Returns:
            Dict[str, Any]: The requested entity bundle matching
            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
        """
        return await get_entity_id_bundle2(
            entity_id=synapse_id,
            request={"includeAnnotations": True},
            synapse_client=self.syn,
        )

    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
        """Store annotations in an async way.

        Args:
            annotation_dict (dict): annotations in a dictionary format

        Returns:
            Annotations: The stored annotations.
        """
        annotation_data = Annotations.from_dict(
            synapse_annotations=annotation_dict["annotations"]["annotations"]
        )
        annotation_class = Annotations(
            annotations=annotation_data,
            etag=annotation_dict["annotations"]["etag"],
            id=annotation_dict["annotations"]["id"],
        )
        annotation_storage_result = await annotation_class.store_async(
            synapse_client=self.syn
        )
        local_entity = self.synapse_entity_tracker.get(
            synapse_id=annotation_dict["annotations"]["id"],
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if local_entity:
            local_entity.etag = annotation_storage_result.etag
            local_entity.annotations = annotation_storage_result
        return annotation_storage_result
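
    # A minimal sketch of driving `get_async_annotation` for several entities
    # at once (hypothetical Synapse IDs; assumes no event loop is already
    # running):
    #
    #     async def _fetch_bundles(store: "SynapseStorage") -> list:
    #         return await asyncio.gather(
    #             store.get_async_annotation("syn111"),
    #             store.get_async_annotation("syn222"),
    #         )
    #
    #     bundles = asyncio.run(_fetch_bundles(store))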
    def process_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        metadata_syn: Dict[str, Any],
        hide_blanks: bool,
        csv_list_regex: str,
        annos: Dict[str, Any],
        annotation_keys: str,
    ) -> Dict[str, Any]:
        """Processes metadata annotations based on the logic below:
        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
            An empty or whitespace-only string.
            A NaN value (if the annotation is a float).
        If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
        If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.

        2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name".
        Check if the rule contains "list" as a rule; if it does, split the string by commas and assign the resulting list as the annotation value for that key.

        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

        4. Returns the updated annotations dictionary.

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer
            metadata_syn (dict): metadata used for Synapse storage
            hide_blanks (bool): if true, does not upload annotation keys with blank values.
            csv_list_regex (str): regex to match a comma-separated list
            annos (Dict[str, Any]): dictionary of annotations returned from Synapse
            annotation_keys (str): display_label/class_label

        Returns:
            Dict[str, Any]: annotations as a dictionary

        ```mermaid
        flowchart TD
            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
            C -- Yes --> D{Is hide_blanks True?}
            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
            D -- No --> F[Assign empty string to annotation key]
            C -- No --> G{Is anno_v a string?}
            G -- No --> H[Assign original value of anno_v to annotation key]
            G -- Yes --> I{Does anno_v match csv_list_regex?}
            I -- Yes --> J[Get validation rule of anno_k]
            J --> K{Does the validation rule contain 'list'?}
            K -- Yes --> L[Split anno_v by commas and assign as list]
            I -- No --> H
            K -- No --> H
        ```
        """
        for anno_k, anno_v in metadata_syn.items():
            # Remove keys with NaN, empty-string, or whitespace-only values from
            # the dict of annotations to be uploaded, if present on the current
            # data annotation
            if hide_blanks and (
                (isinstance(anno_v, str) and anno_v.strip() == "")
                or (isinstance(anno_v, float) and np.isnan(anno_v))
            ):
                if anno_k in annos["annotations"]["annotations"]:
                    annos["annotations"]["annotations"].pop(anno_k)
                continue

            # Otherwise save the annotation as appropriate
            if isinstance(anno_v, float) and np.isnan(anno_v):
                annos["annotations"]["annotations"][anno_k] = ""
                continue

            # Handle strings that match the csv_list_regex and pass the validation rule
            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
                # Use a dictionary to dynamically choose the argument
                param = (
                    {"node_display_name": anno_k}
                    if annotation_keys == "display_label"
                    else {"node_label": anno_k}
                )
                node_validation_rules = dmge.get_node_validation_rules(**param)

                if rule_in_rule_list("list", node_validation_rules):
                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
                    continue
            # default: assign the original value
            annos["annotations"]["annotations"][anno_k] = anno_v

        return annos
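
    # Worked example of step 2 above (assumes the data model declares a
    # "list" validation rule for the key):
    #
    #     metadata_syn = {"tissueType": "blood, brain"}
    #     # "blood, brain" matches comma_separated_list_regex(), and the
    #     # node's validation rules contain "list", so the stored value
    #     # becomes ["blood", " brain"] -- note that a plain str.split(",")
    #     # keeps any whitespace that follows a comma.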
    @async_missing_entity_handler
    async def format_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        row: pd.Series,
        entityId: str,
        hideBlanks: bool,
        annotation_keys: str,
    ) -> Union[None, Dict[str, Any]]:
        """Format row annotations.

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer object
            row (pd.Series): row of the manifest
            entityId (str): entity id of the manifest
            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, uploads annotation keys with empty string values
            annotation_keys (str): display_label/class_label

        Returns:
            Union[None, Dict[str, Any]]: if the entity id is in the trash can, return None. Otherwise, return the annotations
        """
        # Prepare metadata for Synapse storage (resolve the display name into a name
        # that Synapse annotations support, e.g. no spaces or parentheses).
        # Note: the removal of special characters applies only to annotation keys; we are not altering the manifest.
        # This could create a divergence between manifest columns and annotations. This should be ok for most use cases.
        # Columns with special characters are outside of the schema.
        metadataSyn = {}
        blacklist_chars = ["(", ")", ".", " ", "-"]

        for k, v in row.to_dict().items():
            if annotation_keys == "display_label":
                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
            elif annotation_keys == "class_label":
                keySyn = get_class_label_from_display_name(str(k)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )

            # Skip `Filename` and `ETag` columns when setting annotations
            if keySyn in ["Filename", "ETag", "eTag"]:
                continue

            # Truncate annotation values if they are 500 characters or longer,
            # and append an explicit [truncatedByDataCuratorApp] marker at the
            # end of every truncated value to indicate that the cell value
            # has been truncated
            if isinstance(v, str) and len(v) >= 500:
                v = v[0:472] + "[truncatedByDataCuratorApp]"

            metadataSyn[keySyn] = v

        # This will first check if the entity is already in memory, and if so, that
        # instance is used. Unfortunately, the expected return format needs to match
        # the Synapse API, so we need to convert the annotations to the expected format.
        entity = self.synapse_entity_tracker.get(
            synapse_id=entityId,
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if entity is not None:
            synapse_annotations = _convert_to_annotations_list(
                annotations=entity.annotations
            )
            annos = {
                "annotations": {
                    "id": entity.id,
                    "etag": entity.etag,
                    "annotations": synapse_annotations,
                }
            }
        else:
            annos = await self.get_async_annotation(entityId)

        # set annotation(s) for the various objects/items in a dataset on Synapse
        csv_list_regex = comma_separated_list_regex()

        annos = self.process_row_annotations(
            dmge=dmge,
            metadata_syn=metadataSyn,
            hide_blanks=hideBlanks,
            csv_list_regex=csv_list_regex,
            annos=annos,
            annotation_keys=annotation_keys,
        )

        return annos
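
    # Truncation arithmetic for the 500-character cap above: values are cut
    # to their first 472 characters and the 27-character marker
    # "[truncatedByDataCuratorApp]" is appended, so a truncated value is
    # 472 + 27 = 499 characters, just under the threshold:
    #
    #     v = "x" * 600
    #     truncated = v[0:472] + "[truncatedByDataCuratorApp]"
    #     assert len(truncated) == 499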
1969 """ 1970 1971 entity = self.synapse_entity_tracker.get( 1972 synapse_id=manifest_synapse_id, syn=self.syn, download_file=False 1973 ) 1974 is_file = entity.concreteType.endswith(".FileEntity") 1975 is_table = entity.concreteType.endswith(".TableEntity") 1976 1977 if is_file: 1978 # Get file metadata 1979 metadata = self.getFileAnnotations(manifest_synapse_id) 1980 1981 # If there is a defined component add it to the metadata. 1982 if "Component" in manifest.columns: 1983 # Gather component information 1984 component = manifest["Component"].unique() 1985 1986 # Double check that only a single component is listed, else raise an error. 1987 try: 1988 len(component) == 1 1989 except ValueError as err: 1990 raise ValueError( 1991 f"Manifest has more than one component. Please check manifest and resubmit." 1992 ) from err 1993 1994 # Add component to metadata 1995 metadata["Component"] = component[0] 1996 1997 elif is_table: 1998 # Get table metadata 1999 metadata = self.getTableAnnotations(manifest_synapse_id) 2000 2001 # Get annotations 2002 annos = OldAnnotations( 2003 id=entity.id, etag=entity.etag, values=entity.annotations 2004 ) 2005 2006 # Add metadata to the annotations 2007 for annos_k, annos_v in metadata.items(): 2008 annos[annos_k] = annos_v 2009 2010 return annos 2011 2012 ''' 2013 def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath, 2014 useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False): 2015 """ 2016 Purpose: 2017 Works very similarly to associateMetadataWithFiles except takes in the manifest 2018 rather than the manifest path 2019 2020 """ 2021 2022 # Add uuid for table updates and fill. 2023 if not "Uuid" in manifest.columns: 2024 manifest["Uuid"] = '' 2025 2026 for idx,row in manifest.iterrows(): 2027 if not row["Uuid"]: 2028 gen_uuid = uuid.uuid4() 2029 row["Uuid"] = gen_uuid 2030 manifest.loc[idx, 'Uuid'] = gen_uuid 2031 2032 # add entityId as a column if not already there or 2033 # fill any blanks with an empty string. 2034 if not "entityId" in manifest.columns: 2035 manifest["entityId"] = "" 2036 else: 2037 manifest["entityId"].fillna("", inplace=True) 2038 2039 # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations 2040 dmge = DataModelGraphExplorer() 2041 2042 # Create table name here. 
        if 'Component' in manifest.columns:
            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
        else:
            table_name = 'synapse_storage_manifest_table'

        # Upload manifest as a table and get the SynID and manifest
        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
            dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)

        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
        # also set metadata for each synapse entity as Synapse annotations
        for idx, row in manifest.iterrows():
            if not row["entityId"]:
                # If not using entityIds, fill with manifest_table_id so
                row["entityId"] = manifest_synapse_table_id
                entityId = ''
            else:
                # get the entity id corresponding to this row
                entityId = row["entityId"]

        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)

        # Get annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)

        self.syn.set_annotations(manifest_annotations)

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
        self.make_synapse_table(
            table_to_load = table_manifest,
            dataset_id = datasetId,
            existingTableId = manifest_synapse_table_id,
            table_name = table_name,
            update_col = 'Uuid',
            specify_schema = False,
            )

        # Get annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
        self.syn.set_annotations(manifest_annotations)
        return manifest_synapse_table_id
    '''

    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
        """Helper function to read in the provided manifest as a pandas DataFrame for subsequent downstream processing.
        Args:
            metadataManifestPath (str): path where the manifest is stored
        Returns:
            manifest (pd.DataFrame): manifest loaded as a pandas DataFrame
        Raises:
            FileNotFoundError: Manifest file does not exist at the provided path.
        """
        # read new manifest csv
        try:
            load_args = {
                "dtype": "string",
            }
            manifest = load_df(
                metadataManifestPath,
                preserve_raw_input=False,
                allow_na_values=False,
                **load_args,
            )
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"No manifest file was found at this path: {metadataManifestPath}"
            ) from err
        return manifest

    def _add_id_columns_to_manifest(
        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
    ):
        """Helper function to add `Id` and `entityId` columns to the manifest if they do not already exist, and fill `Id` values per row.
        Args:
            manifest: manifest loaded as a pd.DataFrame
            dmge: DataModelGraphExplorer object
        Returns (pd.DataFrame):
            Manifest df with new Id and entityId columns (and UUID values) if they were not already present.
        """

        # Add Id for table updates and fill.
        if not col_in_dataframe("Id", manifest):
            # See if schema has `Uuid` column specified
            try:
                uuid_col_in_schema = dmge.is_class_in_schema(
                    "Uuid"
                ) or dmge.is_class_in_schema("uuid")
            except KeyError:
                uuid_col_in_schema = False

            # Rename `Uuid` column if it wasn't specified in the schema
            if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema:
                manifest.rename(columns={"Uuid": "Id"}, inplace=True)
            # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column
            else:
                manifest["Id"] = ""

        # Retrieve the ID column name (id, Id and ID are treated the same).
        id_col_name = [col for col in manifest.columns if col.lower() == "id"][0]

        # Check if values have been added to the Id column; if not, add a UUID so the value in the row is not blank.
        for idx, row in manifest.iterrows():
            if not row[id_col_name]:
                gen_uuid = str(uuid.uuid4())
                row[id_col_name] = gen_uuid
                manifest.loc[idx, id_col_name] = gen_uuid

        # add entityId as a column if not already there or
        # fill any blanks with an empty string.
        if not col_in_dataframe("entityId", manifest):
            manifest["entityId"] = ""
        else:
            manifest["entityId"].fillna("", inplace=True)

        return manifest

    def _generate_table_name(self, manifest):
        """Helper function to generate a table name for upload to Synapse.

        Args:
            manifest: manifest loaded as a pd.DataFrame

        Returns:
            table_name (str): name of the table to load
            component_name (str): name of the manifest component (if applicable)
        """
        # Create table name here.
        if "Component" in manifest.columns:
            component_name = manifest["Component"][0].lower()
            table_name = component_name + "_synapse_storage_manifest_table"
        else:
            component_name = ""
            table_name = "synapse_storage_manifest_table"
        return table_name, component_name
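
    # Example of the name derivation in `_generate_table_name` (hypothetical
    # manifest; `store` is an existing SynapseStorage instance):
    #
    #     manifest = pd.DataFrame({"Component": ["Biospecimen"]})
    #     table_name, component_name = store._generate_table_name(manifest)
    #     # table_name == "biospecimen_synapse_storage_manifest_table"
    #     # component_name == "biospecimen"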
    def _create_entity_id(self, idx, row, manifest, datasetId):
        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
        Args:
            idx: index of the current row of the manifest being processed
            row: current row of the manifest being processed
            manifest (pd.DataFrame): loaded df containing user supplied data.
            datasetId (str): synapse ID of folder containing the dataset

        Returns:
            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
            entityId (str): generated entity ID.

        """
        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
        rowEntity = self.syn.store(rowEntity)
        entityId = rowEntity["id"]
        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
        row["entityId"] = entityId
        manifest.loc[idx, "entityId"] = entityId
        return manifest, entityId

    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
        """Process annotations and store them on Synapse asynchronously.

        Args:
            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by the format_row_annotations function in the previous step

        Raises:
            RuntimeError: raise a runtime error if a task failed to complete
        """
        while requests:
            done_tasks, pending_tasks = await asyncio.wait(
                requests, return_when=asyncio.FIRST_COMPLETED
            )
            requests = pending_tasks

            for completed_task in done_tasks:
                try:
                    annos = completed_task.result()

                    if isinstance(annos, Annotations):
                        logger.info(f"Successfully stored annotations for {annos.id}")
                    else:
                        # store annotations if they are not None
                        if annos:
                            entity_id = annos["annotations"]["id"]
                            logger.info(
                                f"Obtained and processed annotations for {entity_id} entity"
                            )
                            requests.add(
                                asyncio.create_task(
                                    self.store_async_annotation(annotation_dict=annos)
                                )
                            )
                except Exception as e:
                    raise RuntimeError(f"failed with { repr(e) }.") from e
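
    # The loop above is a drain-and-refill pattern: asyncio.wait with
    # FIRST_COMPLETED returns as soon as any task finishes, the pending set
    # becomes the new work queue, and follow-up store tasks are added back in.
    # A stripped-down sketch of the same pattern (assumes each task's result
    # is either None or a follow-up coroutine):
    #
    #     async def drain(tasks: set) -> None:
    #         while tasks:
    #             done, tasks = await asyncio.wait(
    #                 tasks, return_when=asyncio.FIRST_COMPLETED
    #             )
    #             for finished in done:
    #                 follow_up = finished.result()
    #                 if follow_up is not None:
    #                     tasks.add(asyncio.create_task(follow_up))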
    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
    async def add_annotations_to_entities_files(
        self,
        dmge,
        manifest,
        manifest_record_type: str,
        datasetId: str,
        hideBlanks: bool,
        manifest_synapse_table_id="",
        annotation_keys: str = "class_label",
    ):
        """
        Depending on the upload type, add IDs to the entityId column. Add annotations to connected
        files and folders. Despite the name of this function, it also applies to folders.

        Args:
            dmge: DataModelGraphExplorer Object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
            datasetId (str): synapse ID of folder containing the dataset
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            manifest_synapse_table_id (str): Default is an empty string ''.
            annotation_keys (str): display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.

        Returns:
            manifest (pd.DataFrame): modified to add entityId as appropriate

        """

        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
        if "filename" in [col.lower() for col in manifest.columns]:
            # get current list of files and store as dataframe
            dataset_files = self.getFilesInStorageDataset(datasetId)
            files_and_entityIds = self._get_file_entityIds(
                dataset_files=dataset_files, only_new_files=False
            )
            file_df = pd.DataFrame(files_and_entityIds)

            # Merge dataframes to add entityIds
            manifest = manifest.merge(
                file_df, how="left", on="Filename", suffixes=["_x", None]
            ).drop("entityId_x", axis=1)

        # Fill `entityId` for each row if missing and annotate entity as appropriate
        requests = set()
        for idx, row in manifest.iterrows():
            if not row["entityId"] and (
                manifest_record_type == "file_and_entities"
                or manifest_record_type == "table_file_and_entities"
            ):
                manifest, entityId = self._create_entity_id(
                    idx, row, manifest, datasetId
                )
            elif not row["entityId"] and manifest_record_type == "table_and_file":
                # If not using entityIds, fill with manifest_table_id instead
                row["entityId"] = manifest_synapse_table_id
                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
                entityId = ""
            # If the row is the manifest table, do not add annotations
            elif row["entityId"] == manifest_synapse_table_id:
                entityId = ""
            else:
                # get the file id of the file to annotate, collected in the step above.
                entityId = row["entityId"]

            # Adding annotations to connected files.
            if entityId:
                # Format annotations for Synapse
                annos_task = asyncio.create_task(
                    self.format_row_annotations(
                        dmge, row, entityId, hideBlanks, annotation_keys
                    )
                )
                requests.add(annos_task)
        await self._process_store_annos(requests)
        return manifest

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
    def upload_manifest_as_table(
        self,
        dmge: DataModelGraphExplorer,
        manifest: pd.DataFrame,
        metadataManifestPath: str,
        datasetId: str,
        table_name: str,
        component_name: str,
        restrict: bool,
        manifest_record_type: str,
        hideBlanks: bool,
        table_manipulation: str,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and csv.
        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            table_name (str): Generated to name the table being uploaded.
            component_name (str): Name of the component manifest that is currently being uploaded.
            restrict (bool): Flag for censored data.
            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            table_manipulation (str): Specifies how the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
        Return:
            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
        """
        # Upload manifest as a table, get the ID and updated manifest.
        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
        )

        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    manifest_synapse_table_id,
                    annotation_keys,
                )
            )
        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            restrict_manifest=restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(annotations=manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
        manifest_synapse_table_id, manifest, _ = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation="update",
            table_column_names=table_column_names,
        )

        # Set annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
        )
        annotations_manifest_table = self.syn.set_annotations(
            annotations=manifest_annotations
        )
        manifest_table_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
        )
        manifest_table_entity.annotations = annotations_manifest_table
        manifest_table_entity.etag = annotations_manifest_table.etag

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
    def upload_manifest_as_csv(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        restrict,
        manifest_record_type,
        hideBlanks,
        component_name,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a csv only.
        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            restrict (bool): Flag for censored data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
        Return:
            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
        """
        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    annotation_keys=annotation_keys,
                )
            )

        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest,
            metadataManifestPath,
            datasetId,
            restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest, manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
    def upload_manifest_combo(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        table_name,
        component_name,
        restrict,
        manifest_record_type,
        hideBlanks,
        table_manipulation,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and CSV with entities.
        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            table_name (str): Generated to name the table being uploaded.
            component_name (str): Name of the component manifest that is currently being uploaded.
            restrict (bool): Flag for censored data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            table_manipulation (str): Specifies how the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
        Return:
            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2528 """ 2529 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2530 dmge=dmge, 2531 manifest=manifest, 2532 datasetId=datasetId, 2533 table_name=table_name, 2534 restrict=restrict, 2535 table_manipulation=table_manipulation, 2536 table_column_names=table_column_names, 2537 ) 2538 2539 if file_annotations_upload: 2540 manifest = asyncio.run( 2541 self.add_annotations_to_entities_files( 2542 dmge, 2543 manifest, 2544 manifest_record_type, 2545 datasetId, 2546 hideBlanks, 2547 manifest_synapse_table_id, 2548 annotation_keys=annotation_keys, 2549 ) 2550 ) 2551 2552 # Load manifest to synapse as a CSV File 2553 manifest_synapse_file_id = self.upload_manifest_file( 2554 manifest, metadataManifestPath, datasetId, restrict, component_name 2555 ) 2556 2557 # Set annotations for the file manifest. 2558 manifest_annotations = self.format_manifest_annotations( 2559 manifest, manifest_synapse_file_id 2560 ) 2561 file_manifest_annoations = self.syn.set_annotations(manifest_annotations) 2562 manifest_entity = self.synapse_entity_tracker.get( 2563 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2564 ) 2565 manifest_entity.annotations = file_manifest_annoations 2566 manifest_entity.etag = file_manifest_annoations.etag 2567 logger.info("Associated manifest file with dataset on Synapse.") 2568 2569 # Update manifest Synapse table with new entity id column. 2570 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2571 dmge=dmge, 2572 manifest=manifest, 2573 datasetId=datasetId, 2574 table_name=table_name, 2575 restrict=restrict, 2576 table_manipulation="update", 2577 table_column_names=table_column_names, 2578 ) 2579 2580 # Set annotations for the table manifest 2581 manifest_annotations = self.format_manifest_annotations( 2582 manifest, manifest_synapse_table_id 2583 ) 2584 table_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2585 manifest_entity = self.synapse_entity_tracker.get( 2586 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2587 ) 2588 manifest_entity.annotations = table_manifest_annotations 2589 manifest_entity.etag = table_manifest_annotations.etag 2590 return manifest_synapse_file_id 2591 2592 @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") 2593 def associateMetadataWithFiles( 2594 self, 2595 dmge: DataModelGraphExplorer, 2596 metadataManifestPath: str, 2597 datasetId: str, 2598 manifest_record_type: str = "table_file_and_entities", 2599 hideBlanks: bool = False, 2600 restrict_manifest=False, 2601 table_manipulation: str = "replace", 2602 table_column_names: str = "class_label", 2603 annotation_keys: str = "class_label", 2604 file_annotations_upload: bool = True, 2605 ) -> str: 2606 """Associate metadata with files in a storage dataset already on Synapse. 2607 Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. 2608 2609 If this is a new manifest there could be no Synapse entities associated with the rows of this manifest 2610 this may be due to data type (e.g. clinical data) being tabular 2611 and not requiring files; to utilize uniform interfaces downstream 2612 (i.e. fileviews), a Synapse entity (a folder) is created for each row 2613 and an entity column is added to the manifest containing the resulting 2614 entity IDs; a table is also created at present as an additional interface 2615 for downstream query and interaction with the data. 

        Args:
            dmge: DataModelGraphExplorer Object
            metadataManifestPath: path to csv containing a validated metadata manifest.
                The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
                Some datasets, e.g. clinical data, do not contain file IDs, but data is stored in a table: one row per item.
                In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column-set data as metadata/annotations to this file.
            datasetId: synapse ID of folder containing the dataset
            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table_and_file' options in combination.
            hideBlanks: Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            restrict_manifest (bool): Default is False. Flag for censored data.
            table_manipulation (str): Default is 'replace'. Specifies how the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
        Returns:
            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2636 """ 2637 # Read new manifest CSV: 2638 manifest = self._read_manifest(metadataManifestPath) 2639 manifest = self._add_id_columns_to_manifest(manifest, dmge) 2640 2641 table_name, component_name = self._generate_table_name(manifest) 2642 2643 # Upload manifest to synapse based on user input (manifest_record_type) 2644 if manifest_record_type == "file_only": 2645 manifest_synapse_file_id = self.upload_manifest_as_csv( 2646 dmge=dmge, 2647 manifest=manifest, 2648 metadataManifestPath=metadataManifestPath, 2649 datasetId=datasetId, 2650 restrict=restrict_manifest, 2651 hideBlanks=hideBlanks, 2652 manifest_record_type=manifest_record_type, 2653 component_name=component_name, 2654 annotation_keys=annotation_keys, 2655 file_annotations_upload=file_annotations_upload, 2656 ) 2657 elif manifest_record_type == "table_and_file": 2658 manifest_synapse_file_id = self.upload_manifest_as_table( 2659 dmge=dmge, 2660 manifest=manifest, 2661 metadataManifestPath=metadataManifestPath, 2662 datasetId=datasetId, 2663 table_name=table_name, 2664 component_name=component_name, 2665 restrict=restrict_manifest, 2666 hideBlanks=hideBlanks, 2667 manifest_record_type=manifest_record_type, 2668 table_manipulation=table_manipulation, 2669 table_column_names=table_column_names, 2670 annotation_keys=annotation_keys, 2671 file_annotations_upload=file_annotations_upload, 2672 ) 2673 elif manifest_record_type == "file_and_entities": 2674 manifest_synapse_file_id = self.upload_manifest_as_csv( 2675 dmge=dmge, 2676 manifest=manifest, 2677 metadataManifestPath=metadataManifestPath, 2678 datasetId=datasetId, 2679 restrict=restrict_manifest, 2680 hideBlanks=hideBlanks, 2681 manifest_record_type=manifest_record_type, 2682 component_name=component_name, 2683 annotation_keys=annotation_keys, 2684 file_annotations_upload=file_annotations_upload, 2685 ) 2686 elif manifest_record_type == "table_file_and_entities": 2687 manifest_synapse_file_id = self.upload_manifest_combo( 2688 dmge=dmge, 2689 manifest=manifest, 2690 metadataManifestPath=metadataManifestPath, 2691 datasetId=datasetId, 2692 table_name=table_name, 2693 component_name=component_name, 2694 restrict=restrict_manifest, 2695 hideBlanks=hideBlanks, 2696 manifest_record_type=manifest_record_type, 2697 table_manipulation=table_manipulation, 2698 table_column_names=table_column_names, 2699 annotation_keys=annotation_keys, 2700 file_annotations_upload=file_annotations_upload, 2701 ) 2702 else: 2703 raise ValueError("Please enter a valid manifest_record_type.") 2704 return manifest_synapse_file_id 2705 2706 def getTableAnnotations(self, table_id: str): 2707 """Generate dictionary of annotations for the given Synapse file. 2708 Synapse returns all custom annotations as lists since they 2709 can contain multiple values. In all cases, the values will 2710 be converted into strings and concatenated with ", ". 2711 2712 Args: 2713 fileId (str): Synapse ID for dataset file. 2714 2715 Returns: 2716 dict: Annotations as comma-separated strings. 
2717 """ 2718 try: 2719 entity = self.synapse_entity_tracker.get( 2720 synapse_id=table_id, syn=self.syn, download_file=False 2721 ) 2722 is_table = entity.concreteType.endswith(".TableEntity") 2723 annotations_raw = entity.annotations 2724 except SynapseHTTPError: 2725 # If an error occurs with retrieving entity, skip it 2726 # This could be caused by a temporary file view that 2727 # was deleted since its ID was retrieved 2728 is_file, is_table = False, False 2729 2730 # Skip anything that isn't a file or folder 2731 if not (is_table): 2732 return None 2733 2734 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2735 2736 return annotations 2737 2738 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2739 """Generate dictionary of annotations for the given Synapse file. 2740 Synapse returns all custom annotations as lists since they 2741 can contain multiple values. In all cases, the values will 2742 be converted into strings and concatenated with ", ". 2743 2744 Args: 2745 fileId (str): Synapse ID for dataset file. 2746 2747 Returns: 2748 dict: Annotations as comma-separated strings. 2749 """ 2750 2751 # Get entity metadata, including annotations 2752 try: 2753 entity = self.synapse_entity_tracker.get( 2754 synapse_id=fileId, syn=self.syn, download_file=False 2755 ) 2756 is_file = entity.concreteType.endswith(".FileEntity") 2757 is_folder = entity.concreteType.endswith(".Folder") 2758 annotations_raw = entity.annotations 2759 except SynapseHTTPError: 2760 # If an error occurs with retrieving entity, skip it 2761 # This could be caused by a temporary file view that 2762 # was deleted since its ID was retrieved 2763 is_file, is_folder = False, False 2764 2765 # Skip anything that isn't a file or folder 2766 if not (is_file or is_folder): 2767 return None 2768 2769 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2770 2771 return annotations 2772 2773 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2774 # Extract annotations from their lists and stringify. For example: 2775 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2776 annotations = dict() 2777 for key, vals in annotations_raw.items(): 2778 if isinstance(vals, list) and len(vals) == 1: 2779 annotations[key] = str(vals[0]) 2780 else: 2781 annotations[key] = ", ".join(str(v) for v in vals) 2782 2783 # Add the file entity ID and eTag, which weren't lists 2784 assert fileId == entity.id, ( 2785 "For some reason, the Synapse ID in the response doesn't match" 2786 "the Synapse ID sent in the request (via synapseclient)." 2787 ) 2788 annotations["entityId"] = fileId 2789 annotations["eTag"] = entity.etag 2790 2791 return annotations 2792 2793 def getDatasetAnnotations( 2794 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2795 ) -> pd.DataFrame: 2796 """Generate table for annotations across all files in given dataset. 2797 2798 Args: 2799 datasetId (str): Synapse ID for dataset folder. 2800 fill_na (bool): Whether to replace missing values with 2801 blank strings. 2802 force_batch (bool): Whether to force the function to use 2803 the batch mode, which uses a file view to retrieve 2804 annotations for a given dataset. Default to False 2805 unless there are more than 50 files in the dataset. 2806 2807 Returns: 2808 pd.DataFrame: Table of annotations. 
2809 """ 2810 # Get all files in given dataset 2811 dataset_files = self.getFilesInStorageDataset(datasetId) 2812 2813 # if there are no dataset files, there are no annotations 2814 # return None 2815 if not dataset_files: 2816 return pd.DataFrame() 2817 2818 dataset_files_map = dict(dataset_files) 2819 dataset_file_ids, _ = list(zip(*dataset_files)) 2820 2821 # Get annotations for each file from Step 1 2822 # Batch mode 2823 try_batch = len(dataset_files) >= 50 or force_batch 2824 if try_batch: 2825 try: 2826 logger.info("Trying batch mode for retrieving Synapse annotations") 2827 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2828 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2829 logger.info( 2830 f"Unable to create a temporary file view bound to {datasetId}. " 2831 "Defaulting to slower iterative retrieval of annotations." 2832 ) 2833 # Default to the slower non-batch method 2834 logger.info("Batch mode failed (probably due to permission error)") 2835 try_batch = False 2836 2837 # Non-batch mode 2838 if not try_batch: 2839 logger.info("Using slower (non-batch) sequential mode") 2840 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2841 # Remove any annotations for non-file/folders (stored as None) 2842 records = filter(None, records) 2843 table = pd.DataFrame.from_records(records) 2844 2845 # Add filenames for the files that "survived" annotation retrieval 2846 filenames = [dataset_files_map[i] for i in table["entityId"]] 2847 2848 if "Filename" not in table.columns: 2849 table.insert(0, "Filename", filenames) 2850 2851 # Ensure that entityId and eTag are at the end 2852 entity_ids = table.pop("entityId") 2853 etags = table.pop("eTag") 2854 table.insert(len(table.columns), "entityId", entity_ids) 2855 table.insert(len(table.columns), "eTag", etags) 2856 2857 # Missing values are filled in with empty strings for Google Sheets 2858 if fill_na: 2859 table.fillna("", inplace=True) 2860 2861 # Force all values as strings 2862 return table.astype(str) 2863 2864 def raise_final_error(retry_state): 2865 return retry_state.outcome.result() 2866 2867 def checkIfinAssetView(self, syn_id) -> str: 2868 # get data in administrative fileview for this pipeline 2869 assetViewTable = self.getStorageFileviewTable() 2870 all_files = list(assetViewTable["id"]) 2871 if syn_id in all_files: 2872 return True 2873 else: 2874 return False 2875 2876 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2877 @retry( 2878 stop=stop_after_attempt(5), 2879 wait=wait_chain( 2880 *[wait_fixed(10) for i in range(2)] 2881 + [wait_fixed(15) for i in range(2)] 2882 + [wait_fixed(20)] 2883 ), 2884 retry=retry_if_exception_type(LookupError), 2885 retry_error_callback=raise_final_error, 2886 ) 2887 def getDatasetProject(self, datasetId: str) -> str: 2888 """Get parent project for a given dataset ID. 2889 2890 Args: 2891 datasetId (str): Synapse entity ID (folder or project). 2892 2893 Raises: 2894 ValueError: Raised if Synapse ID cannot be retrieved 2895 by the user or if it doesn't appear in the file view. 2896 2897 Returns: 2898 str: The Synapse ID for the parent project. 
2899 """ 2900 2901 # Subset main file view 2902 dataset_index = self.storageFileviewTable["id"] == datasetId 2903 dataset_row = self.storageFileviewTable[dataset_index] 2904 2905 # re-query if no datasets found 2906 if dataset_row.empty: 2907 sleep(5) 2908 self.query_fileview(force_requery=True) 2909 # Subset main file view 2910 dataset_index = self.storageFileviewTable["id"] == datasetId 2911 dataset_row = self.storageFileviewTable[dataset_index] 2912 2913 # Return `projectId` for given row if only one found 2914 if len(dataset_row) == 1: 2915 dataset_project = dataset_row["projectId"].values[0] 2916 return dataset_project 2917 2918 # Otherwise, check if already project itself 2919 try: 2920 syn_object = self.synapse_entity_tracker.get( 2921 synapse_id=datasetId, syn=self.syn, download_file=False 2922 ) 2923 if syn_object.properties["concreteType"].endswith("Project"): 2924 return datasetId 2925 except SynapseHTTPError: 2926 raise PermissionError( 2927 f"The given dataset ({datasetId}) isn't accessible with this " 2928 "user. This might be caused by a typo in the dataset Synapse ID." 2929 ) 2930 2931 # If not, then assume dataset not in file view 2932 raise LookupError( 2933 f"The given dataset ({datasetId}) doesn't appear in the " 2934 f"configured file view ({self.storageFileview}). This might " 2935 "mean that the file view's scope needs to be updated." 2936 ) 2937 2938 def getDatasetAnnotationsBatch( 2939 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2940 ) -> pd.DataFrame: 2941 """Generate table for annotations across all files in given dataset. 2942 This function uses a temporary file view to generate a table 2943 instead of iteratively querying for individual entity annotations. 2944 This function is expected to run much faster than 2945 `self.getDatasetAnnotationsBatch` on large datasets. 2946 2947 Args: 2948 datasetId (str): Synapse ID for dataset folder. 2949 dataset_file_ids (Sequence[str]): List of Synapse IDs 2950 for dataset files/folders used to subset the table. 2951 2952 Returns: 2953 pd.DataFrame: Table of annotations. 2954 """ 2955 # Create data frame from annotations file view 2956 with DatasetFileView(datasetId, self.syn) as fileview: 2957 table = fileview.query() 2958 2959 if dataset_file_ids: 2960 table = table.loc[table.index.intersection(dataset_file_ids)] 2961 2962 table = table.reset_index(drop=True) 2963 2964 return table 2965 2966 def _get_table_schema_by_cname(self, table_schema): 2967 # assume no duplicate column names in the table 2968 table_schema_by_cname = {} 2969 2970 for col_record in table_schema: 2971 # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key) 2972 table_schema_by_cname[col_record["name"]] = col_record 2973 2974 return table_schema_by_cname 2975 2976 2977class TableOperations: 2978 """ 2979 Object to hold functions for various table operations specific to the Synapse Asset Store. 

    Currently implemented operations are:
        createTable: upload a manifest as a new table when none exists
        replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
        updateTable: add a column to a table that already exists on synapse

    Operations currently in development are:
        upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
    """

    def __init__(
        self,
        synStore: SynapseStorage,
        tableToLoad: pd.DataFrame = None,
        tableName: str = None,
        datasetId: str = None,
        existingTableId: str = None,
        restrict: bool = False,
        synapse_entity_tracker: SynapseEntityTracker = None,
    ):
        """
        Class governing table operations (creation, replacement, upserts, updates) in schematic

        tableToLoad: manifest formatted appropriately for the table
        tableName: name of the table to be uploaded
        datasetId: synID of the dataset for the manifest
        existingTableId: synId of the table currently existing on synapse (if there is one)
        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

        """
        self.synStore = synStore
        self.tableToLoad = tableToLoad
        self.tableName = tableName
        self.datasetId = datasetId
        self.existingTableId = existingTableId
        self.restrict = restrict
        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()

    @tracer.start_as_current_span("TableOperations::createTable")
    def createTable(
        self,
        columnTypeDict: dict = None,
        specifySchema: bool = True,
    ):
        """
        Method to create a table from a metadata manifest and upload it to synapse

        Args:
            columnTypeDict: dictionary schema for table columns: type, size, etc
            specifySchema: whether to explicitly specify a schema for the table format

        Returns:
            table.schema.id: synID of the newly created table
        """
        datasetEntity = self.synapse_entity_tracker.get(
            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
        )
        datasetName = datasetEntity.name
        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)

        if not self.tableName:
            self.tableName = datasetName + "table"
        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
        if specifySchema:
            if columnTypeDict == {}:
                logger.error("Did not provide a columnTypeDict.")
            # create list of columns:
            cols = []
            for col in self.tableToLoad.columns:
                if col in table_schema_by_cname:
                    col_type = table_schema_by_cname[col]["columnType"]
                    max_size = (
                        table_schema_by_cname[col]["maximumSize"]
                        if "maximumSize" in table_schema_by_cname[col].keys()
                        else 100
                    )
                    max_list_len = 250
                    if max_size and max_list_len:
                        cols.append(
                            Column(
                                name=col,
                                columnType=col_type,
                                maximumSize=max_size,
                                maximumListLength=max_list_len,
                            )
                        )
                    elif max_size:
                        cols.append(
                            Column(name=col, columnType=col_type, maximumSize=max_size)
                        )
                    else:
                        cols.append(Column(name=col, columnType=col_type))
                else:
                    # TODO add warning that the given col was not found and its max size is set to 100
                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
            schema = Schema(
                name=self.tableName, columns=cols, parent=datasetParentProject
            )
            table = Table(schema, self.tableToLoad)
            table = self.synStore.syn.store(table, isRestricted=self.restrict)
            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
            return table.schema.id
        else:
            # For just uploading the tables to synapse using default
            # column types.
            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
            table = self.synStore.syn.store(table, isRestricted=self.restrict)
            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
            return table.schema.id

    @tracer.start_as_current_span("TableOperations::replaceTable")
    def replaceTable(
        self,
        specifySchema: bool = True,
        columnTypeDict: dict = None,
    ):
        """
        Method to replace an existing table on synapse with metadata from a new manifest

        Args:
            specifySchema: whether to explicitly specify a schema for the table format
            columnTypeDict: dictionary schema for table columns: type, size, etc

        Returns:
            existingTableId: synID of the already existing table that had its metadata replaced
        """
        datasetEntity = self.synapse_entity_tracker.get(
            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
        )

        datasetName = datasetEntity.name
        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
        existing_table, existing_results = self.synStore.get_synapse_table(
            self.existingTableId
        )
        # remove rows
        self.synStore.syn.delete(existing_results)
        # Data changes such as removing all rows cause the eTag to change.
        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
        # wait for row deletion to finish on synapse before getting empty table
        sleep(10)

        # removes all current columns
        current_table = self.synapse_entity_tracker.get(
            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
        )

        current_columns = self.synStore.syn.getTableColumns(current_table)
        for col in current_columns:
            current_table.removeColumn(col)

        if not self.tableName:
            self.tableName = datasetName + "table"

        # Process columns according to manifest entries
        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
        if specifySchema:
            if columnTypeDict == {}:
                logger.error("Did not provide a columnTypeDict.")
            # create list of columns:
            cols = []

            for col in self.tableToLoad.columns:
                if col in table_schema_by_cname:
                    col_type = table_schema_by_cname[col]["columnType"]
                    max_size = (
                        table_schema_by_cname[col]["maximumSize"]
                        if "maximumSize" in table_schema_by_cname[col].keys()
                        else 100
                    )
                    max_list_len = 250
                    if max_size and max_list_len:
                        cols.append(
                            Column(
                                name=col,
                                columnType=col_type,
                                maximumSize=max_size,
                                maximumListLength=max_list_len,
                            )
                        )
                    elif max_size:
                        cols.append(
                            Column(name=col, columnType=col_type, maximumSize=max_size)
                        )
                    else:
                        cols.append(Column(name=col, columnType=col_type))
                else:
                    # TODO add warning that the given col was not found and its max size is set to 100
                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))

            # adds new columns to schema
            for col in cols:
                current_table.addColumn(col)
            table_result = self.synStore.syn.store(
                current_table, isRestricted=self.restrict
            )
            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
            self.synapse_entity_tracker.remove(synapse_id=table_result.id)

            # wait for synapse store to finish
            sleep(1)

            # build schema and table from columns and store with necessary restrictions
            schema = Schema(
                name=self.tableName, columns=cols, parent=datasetParentProject
            )
            schema.id = self.existingTableId
            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
            table = self.synStore.syn.store(table, isRestricted=self.restrict)
            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
        else:
            logging.error("Must specify a schema for table replacements")

        # remove system metadata from manifest
        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
        return self.existingTableId

    @tracer.start_as_current_span("TableOperations::_get_auth_token")
    def _get_auth_token(
        self,
    ):
        authtoken = None

        # Get access token from environment variable if available
        # Primarily useful for testing environments, with other possible usefulness for containers
        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
        if env_access_token:
            authtoken = env_access_token
            return authtoken

        # Get token from authorization header
        # Primarily useful for API endpoint functionality
        if "Authorization" in self.synStore.syn.default_headers:
            authtoken = self.synStore.syn.default_headers["Authorization"].split(
                "Bearer "
            )[-1]
            return authtoken

        # retrieve credentials from synapse object
        # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe
        synapse_object_creds = self.synStore.syn.credentials
        if hasattr(synapse_object_creds, "_token"):
            authtoken = synapse_object_creds.secret

        # Try getting creds from .synapseConfig file if it exists
        # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
        if os.path.exists(CONFIG.synapse_configuration_path):
            config = get_config_file(CONFIG.synapse_configuration_path)

            # check which credentials are provided in file
            if config.has_option("authentication", "authtoken"):
                authtoken = config.get("authentication", "authtoken")

        # raise error if required credentials are not found
        if not authtoken:
            raise NameError(
                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
            )

        return authtoken

    @tracer.start_as_current_span("TableOperations::upsertTable")
    def upsertTable(self, dmge: DataModelGraphExplorer):
        """
        Method to upsert rows from a new manifest into an existing table on synapse
        For upsert functionality to work, primary keys must follow the naming convention of <component>_id
        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
        Currently it is required to use -dl/--use_display_label with table upserts.
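        For example (hypothetical component name), a manifest for a `Biospecimen`
        component would carry its primary key in a `Biospecimen_id` column.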

        Args:
            dmge: DataModelGraphExplorer instance

        Returns:
            existingTableId: synID of the already existing table that rows were upserted into
        """

        authtoken = self._get_auth_token()

        synapseDB = SynapseDatabase(
            auth_token=authtoken,
            project_id=self.synStore.getDatasetProject(self.datasetId),
            syn=self.synStore.syn,
            synapse_entity_tracker=self.synapse_entity_tracker,
        )

        try:
            # Try performing upsert
            synapseDB.upsert_table_rows(
                table_name=self.tableName, data=self.tableToLoad
            )
        except SynapseHTTPError as ex:
            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
            if "Id is not a valid column name or id" in str(ex):
                self._update_table_uuid_column(dmge)
                synapseDB.upsert_table_rows(
                    table_name=self.tableName, data=self.tableToLoad
                )
            # Raise if other error
            else:
                raise ex

        return self.existingTableId

    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
    def _update_table_uuid_column(
        self,
        dmge: DataModelGraphExplorer,
    ) -> None:
        """Removes the `Uuid` column when present, and replaces it with an `Id` column
        Used to enable backwards compatibility for manifests using the old `Uuid` convention

        Args:
            dmge: DataModelGraphExplorer instance

        Returns:
            None
        """

        # Get the columns of the schema
        schema = self.synapse_entity_tracker.get(
            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
        )

        cols = self.synStore.syn.getTableColumns(schema)

        # Iterate through columns until `Uuid` column is found
        for col in cols:
            if col.name.lower() == "uuid":
                # See if schema has `Uuid` column specified
                try:
                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
                except KeyError:
                    uuid_col_in_schema = False

                # If there is, then create a new `Id` column from scratch
                if uuid_col_in_schema:
                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
                    schema.addColumn(new_col)
                    schema = self.synStore.syn.store(schema)
                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
                else:
                    # Build ColumnModel that will be used for new column
                    id_column = Column(
                        name="Id",
                        columnType="STRING",
                        maximumSize=64,
                        defaultValue=None,
                        maximumListLength=1,
                    )
                    new_col_response = self.synStore.syn.store(id_column)

                    # Define columnChange body
                    columnChangeDict = {
                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
                        "entityId": self.existingTableId,
                        "changes": [
                            {
                                "oldColumnId": col["id"],
                                "newColumnId": new_col_response["id"],
                            }
                        ],
                    }

                    self.synStore.syn._async_table_update(
                        table=self.existingTableId,
                        changes=[columnChangeDict],
                        wait=False,
                    )
                break

        return

    @tracer.start_as_current_span("TableOperations::updateTable")
    def updateTable(
        self,
        update_col: str = "Id",
    ):
        """
        Method to update an existing table with a new column

        Args:
            update_col: column to index the old and new tables on

        Returns:
            existingTableId: synID of the already existing table that was updated
        """
        existing_table, existing_results = self.synStore.get_synapse_table(
            self.existingTableId
        )

        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
        # store table with existing etag data and impose restrictions as appropriate
        table_result = self.synStore.syn.store(
            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
            isRestricted=self.restrict,
        )
        # We cannot store the Table to the `synapse_entity_tracker` because there is
        # no `Schema` on the table object. The above `.store()` function call would
        # also update the ETag of the entity within Synapse. Remove it from the tracker
        # and re-retrieve it later on if needed again.
        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)

        return self.existingTableId


class DatasetFileView:
    """Helper class to create temporary dataset file views.
    This class can be used in conjunction with a 'with' statement.
    This will ensure that the file view is deleted automatically.
    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
    """

    def __init__(
        self,
        datasetId: str,
        synapse: Synapse,
        name: str = None,
        temporary: bool = True,
        parentId: str = None,
    ) -> None:
        """Create a file view scoped to a dataset folder.

        Args:
            datasetId (str): Synapse ID for a dataset folder/project.
            synapse (Synapse): Used for Synapse requests.
            name (str): Name of the file view (temporary or not).
            temporary (bool): Whether to delete the file view on exit
                of either a 'with' statement or Python entirely.
            parentId (str, optional): Synapse ID specifying where to
                store the file view. Defaults to datasetId.
        """

        self.datasetId = datasetId
        self.synapse = synapse
        self.is_temporary = temporary

        # fall back to a default name when none is provided
        self.name = name or f"schematic annotation file view for {self.datasetId}"

        if self.is_temporary:
            uid = secrets.token_urlsafe(5)
            self.name = f"{self.name} - UID {uid}"

        # TODO: Allow a DCC admin to configure a "universal parent"
        # Such as a Synapse project writeable by everyone.
        self.parentId = datasetId if parentId is None else parentId

        # TODO: Create local sharing setting to hide from everyone else
        view_schema = EntityViewSchema(
            name=self.name,
            parent=self.parentId,
            scopes=self.datasetId,
            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
            addDefaultViewColumns=False,
            addAnnotationColumns=True,
        )

        # TODO: Handle failure due to insufficient permissions by
        # creating a temporary new project to store view
        self.view_schema = self.synapse.store(view_schema)

        # These are filled in after calling `self.query()`
        self.results = None
        self.table = None

        # Ensure deletion of the file view (last resort)
        if self.is_temporary:
            atexit.register(self.delete)

    def __enter__(self):
        """Return file view when entering 'with' statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Delete file view when exiting 'with' statement."""
        if self.is_temporary:
            self.delete()

    def delete(self):
        """Delete the file view on Synapse without deleting local table."""
        if self.view_schema is not None:
            self.synapse.delete(self.view_schema)
            self.view_schema = None

    def query(self, tidy=True, force=False):
        """Retrieve file view as a data frame (raw format sans index)."""
        if self.table is None or force:
            fileview_id = self.view_schema["id"]
            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
            self.table = self.results.asDataFrame(
                rowIdAndVersionInIndex=False,
                na_values=STR_NA_VALUES_FILTERED,
                keep_default_na=False,
            )
        if tidy:
            self.tidy_table()
        return self.table

    def tidy_table(self):
        """Convert raw file view data frame into more usable format."""
        assert self.table is not None, "Must call `self.query()` first."
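        # The three fix-up passes below run in order: default Synapse columns
        # are renamed/repositioned first, then list columns are flattened to
        # comma-separated strings, then integer columns are normalized.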
        self._fix_default_columns()
        self._fix_list_columns()
        self._fix_int_columns()
        return self.table

    def _fix_default_columns(self):
        """Rename default columns to match schematic expectations."""

        # Drop ROW_VERSION column if present
        if "ROW_VERSION" in self.table:
            del self.table["ROW_VERSION"]

        # Rename id column to entityId and set as data frame index
        if "ROW_ID" in self.table:
            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
            self.table = self.table.set_index("entityId", drop=False)
            del self.table["ROW_ID"]

        # Rename ROW_ETAG column to eTag and place at end of data frame
        if "ROW_ETAG" in self.table:
            row_etags = self.table.pop("ROW_ETAG")

            # eTag column may already be present if users annotated data without submitting manifest
            # we're only concerned with the new values and not the existing ones
            if "eTag" in self.table:
                del self.table["eTag"]

            self.table.insert(len(self.table.columns), "eTag", row_etags)

        return self.table

    def _get_columns_of_type(self, types):
        """Helper function to get list of columns of a given type(s)."""
        matching_columns = []
        for header in self.results.headers:
            if header.columnType in types:
                matching_columns.append(header.name)
        return matching_columns

    def _fix_list_columns(self):
        """Fix formatting of list-columns."""
        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
        list_columns = self._get_columns_of_type(list_types)
        for col in list_columns:
            self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
        return self.table

    def _fix_int_columns(self):
        """Ensure that integer-columns are actually integers."""
        int_columns = self._get_columns_of_type({"INTEGER"})
        for col in int_columns:
            # Coercing to string because NaN is a floating point value
            # and cannot exist alongside integers in a column
            def to_int_fn(x):
                return "" if np.isnan(x) else str(int(x))

            self.table[col] = self.table[col].apply(to_int_fn)
        return self.table
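
# A minimal usage sketch for DatasetFileView (the login call and the
# "syn00000000" dataset ID are placeholders, not values from this module):
#
#     import synapseclient
#
#     syn = synapseclient.login()
#     # build a temporary file view over the dataset folder and query it
#     with DatasetFileView("syn00000000", syn) as fileview:
#         annotations = fileview.query()  # tidied DataFrame of file annotations
#     # the temporary view is deleted automatically on exiting the "with" block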
        filename = manifest_data._file_handle.fileName
        if filename != os.path.basename(manifest_data.path):
            parent_folder = os.path.dirname(manifest_data.path)
            manifest_original_name_and_path = os.path.join(parent_folder, filename)

            self.syn.cache.remove(
                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
            )
            os.rename(manifest_data.path, manifest_original_name_and_path)
            manifest_data.path = manifest_original_name_and_path
            self.syn.cache.add(
                file_handle_id=manifest_data.dataFileHandleId,
                path=manifest_original_name_and_path,
                md5=manifest_data._file_handle.contentMd5,
            )

        return manifest_data

    def _entity_type_checking(self) -> None:
        """
        check the entity type of the id that needs to be downloaded
        Return:
            logs an error if the entity type is not a file (no exception is raised)
        """
        # check the type of entity
        entity_type = entity_type_mapping(
            syn=self.syn,
            entity_id=self.manifest_id,
            synapse_entity_tracker=self.synapse_entity_tracker,
        )
        if entity_type != "file":
            logger.error(
                f"You are using entity type: {entity_type}. Please provide a file ID"
            )

    def download_manifest(
        self,
        newManifestName: str = "",
        manifest_df: pd.DataFrame = pd.DataFrame(),
        use_temporary_folder: bool = True,
    ) -> Union[str, File]:
        """
        Download a manifest based on a given manifest id.
        Args:
            newManifestName(optional): new name of a manifest that gets downloaded.
            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
            use_temporary_folder(optional): whether to download the manifest to a temporary folder. Defaults to True.
        Return:
            manifest_data: synapse entity file object
        """

        # enables retrying if user does not have access to uncensored manifest
        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
        manifest_data = ""

        # check entity type
        self._entity_type_checking()

        # download a manifest
        try:
            manifest_data = self._download_manifest_to_folder(
                use_temporary_folder=use_temporary_folder
            )
        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
            # if there's an error getting an uncensored manifest, try getting the censored manifest
            if not manifest_df.empty:
                censored_regex = re.compile(".*censored.*")
                censored = manifest_df["name"].str.contains(censored_regex)
                new_manifest_id = manifest_df[censored]["id"][0]
                self.manifest_id = new_manifest_id
                try:
                    manifest_data = self._download_manifest_to_folder(
                        use_temporary_folder=use_temporary_folder
                    )
                except (
                    SynapseUnmetAccessRestrictions,
                    SynapseAuthenticationError,
                ) as e:
                    raise PermissionError(
                        "You don't have access to censored and uncensored manifests in this dataset."
                    ) from e
            else:
                logger.error(
                    f"You don't have access to the requested resource: {self.manifest_id}"
                )

        if newManifestName and os.path.exists(manifest_data.get("path")):
            # Rename the file we just made to the new name
            new_manifest_filename = newManifestName + ".csv"

            # get location of existing manifest. The manifest that will be renamed should live in the same folder as the existing manifest.
            parent_folder = os.path.dirname(manifest_data.get("path"))

            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)

            # Copy file to new location. The purpose of using a copy instead of a rename
            # is to avoid any potential issues with the file being used in another
            # process. This avoids any potential race or concurrency conditions.
            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)

            # Adding this to cache will allow us to re-use the already downloaded
            # manifest file for up to 1 hour.
            self.syn.cache.add(
                file_handle_id=manifest_data.dataFileHandleId,
                path=new_manifest_path_name,
                md5=manifest_data._file_handle.contentMd5,
            )

            # Update file names/paths in manifest_data
            manifest_data["name"] = new_manifest_filename
            manifest_data["filename"] = new_manifest_filename
            manifest_data["path"] = new_manifest_path_name

        return manifest_data
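
# A minimal usage sketch for ManifestDownload (the login call and the
# "syn00000000" manifest ID are placeholders, not values from this module):
#
#     import synapseclient
#
#     syn = synapseclient.login()
#     md = ManifestDownload(syn, manifest_id="syn00000000")
#     # downloads to a temporary folder and returns a synapseclient File entity
#     manifest_file = md.download_manifest(newManifestName="my_manifest")
#     print(manifest_file.path)
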
class SynapseStorage(BaseStorage):
    """Implementation of Storage interface for datasets/files stored on Synapse.
    Provides utilities to list files in a specific project; update file annotations, create file views, etc.

    TODO: Need to define the interface and rename and/or refactor some of the methods below.
    """

    @tracer.start_as_current_span("SynapseStorage::__init__")
    def __init__(
        self,
        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
        access_token: Optional[str] = None,
        project_scope: Optional[list] = None,
        synapse_cache_path: Optional[str] = None,
        perform_query: Optional[bool] = True,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
    ) -> None:
        """Initializes a SynapseStorage object.

        Args:
            token (Optional[str], optional):
                Optional token parameter as found in browser cookie upon login to synapse.
                Defaults to None.
            access_token (Optional[str], optional):
                Optional access token (personal or oauth).
                Defaults to None.
            project_scope (Optional[list], optional): Defaults to None.
            synapse_cache_path (Optional[str], optional):
                Location of synapse cache.
                Defaults to None.
        TODO:
            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
        """
        self.syn = self.login(synapse_cache_path, access_token)
        current_span = trace.get_current_span()
        if current_span.is_recording():
            current_span.set_attribute("user.id", self.syn.credentials.owner_id)
        self.project_scope = project_scope
        self.storageFileview = CONFIG.synapse_master_fileview_id
        self.manifest = CONFIG.synapse_manifest_basename
        self.root_synapse_cache = self.syn.cache.cache_root_dir
        self.synapse_entity_tracker = SynapseEntityTracker()
        if perform_query:
            self.query_fileview(columns=columns, where_clauses=where_clauses)

    # TODO: When moving this over to a regular cron-job the following logic should be
    # out of `manifest_download`:
    # if "SECRETS_MANAGER_SECRETS" in os.environ:
    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
    def _purge_synapse_cache(
        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
    ) -> None:
        """
        Purge synapse cache if it exceeds a certain size. Defaults to 1 GB.
        Args:
            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
                before purging cache. Default is 1 GB.
            minute_buffer (int): All files created this amount of time or older will be deleted
        """
        # try clearing the cache
        # scan a directory and check size of files
        if os.path.exists(self.root_synapse_cache):
            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
                1024**3
            )
            nbytes = get_dir_size(self.root_synapse_cache)
            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
            # if the cache limit has already been reached, purge files older than `minute_buffer` minutes
            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
                num_of_deleted_files = clear_synapse_cache(
                    self.syn.cache, minutes=minute_buffer
                )
                logger.info(
                    f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
                )
            else:
                # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB)
                # instead of guessing how much space we have left, print out the size of .synapseCache here
                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")

    @tracer.start_as_current_span("SynapseStorage::query_fileview")
    def query_fileview(
        self,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
        force_requery: Optional[bool] = False,
    ) -> None:
        """
        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
        """
        self._purge_synapse_cache()

        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
        self.new_query_different = True

        # If a query has already been performed, store the query
        previous_query_built = hasattr(self, "fileview_query")
        if previous_query_built:
            previous_query = self.fileview_query

        # Build a query with the current given parameters and check to see if it is different from the previous
        self._build_query(columns=columns, where_clauses=where_clauses)
        if previous_query_built:
            self.new_query_different = self.fileview_query != previous_query

        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
        if self.new_query_different or force_requery:
            try:
                self.storageFileviewTable = self.syn.tableQuery(
                    query=self.fileview_query,
                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
            except SynapseHTTPError as exc:
                exception_text = str(exc)
                if "Unknown column path" in exception_text:
                    raise ValueError(
                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
                    )
                elif "Unknown column" in exception_text:
                    missing_column = exception_text.split("Unknown column ")[-1]
                    raise ValueError(
                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
                    )
                else:
                    raise AccessCredentialsError(self.storageFileview)

    @staticmethod
    def build_clause_from_dataset_id(
        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
    ) -> str:
        """
        Method to build a where clause for a Synapse FileView query based on a dataset ID; because it is static, it can be used before an object is initialized.
        Args:
            dataset_id: Synapse ID of a dataset that should be used to limit the query
            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
        Returns:
            clause for the query or an empty string if no dataset ID is provided
        """
        # Calling this method without specifying synIDs will complete but will not scope the view
        if (not dataset_id) and (not dataset_folder_list):
            return ""

        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
        if dataset_folder_list:
            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
            return f"parentId IN ({search_folders})"

        # `dataset_id` should be provided when all files are stored directly under the dataset folder
        return f"parentId='{dataset_id}'"

    def _build_query(
        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
    ):
        """
        Method to build a query for Synapse FileViews
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            self.storageFileview (str): Synapse FileView ID
            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
                Gets added to where_clauses; included mostly for backwards compatibility and as a more user-friendly way of subsetting the view.
        """
        if columns is None:
            columns = []
        if where_clauses is None:
            where_clauses = []

        if self.project_scope:
            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
            where_clauses.append(project_scope_clause)

        if where_clauses:
            where_clauses = " AND ".join(where_clauses)
            where_clauses = f"WHERE {where_clauses} ;"
        else:
            where_clauses = ";"

        if columns:
            columns = ",".join(columns)
        else:
            columns = "*"

        self.fileview_query = (
            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
        )

        return

    @staticmethod
    @tracer.start_as_current_span("SynapseStorage::login")
    def login(
        synapse_cache_path: Optional[str] = None,
        access_token: Optional[str] = None,
    ) -> synapseclient.Synapse:
        """Login to Synapse

        Args:
            access_token (Optional[str], optional): A synapse access token. Defaults to None.
            synapse_cache_path (Optional[str]): location of synapse cache

        Raises:
            ValueError: If unable to log in with the access token

        Returns:
            synapseclient.Synapse: A Synapse object that is logged in
        """
        # If no token is provided, try retrieving access token from environment
        if not access_token:
            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")

        # login using a token
        if access_token:
            try:
                syn = synapseclient.Synapse(
                    cache_root_dir=synapse_cache_path,
                    debug=False,
                    skip_checks=True,
                    cache_client=False,
                )
                syn.login(authToken=access_token, silent=True)
                current_span = trace.get_current_span()
                if current_span.is_recording():
                    current_span.set_attribute("user.id", syn.credentials.owner_id)
            except SynapseHTTPError as exc:
                raise ValueError(
                    "No access to resources. Please make sure that your token is correct"
                ) from exc
        else:
            # login using synapse credentials provided by user in .synapseConfig (default) file
            syn = synapseclient.Synapse(
                configPath=CONFIG.synapse_configuration_path,
                cache_root_dir=synapse_cache_path,
                debug=False,
                skip_checks=True,
                cache_client=False,
            )
            syn.login(silent=True)
            current_span = trace.get_current_span()
            if current_span.is_recording():
                current_span.set_attribute("user.id", syn.credentials.owner_id)
        return syn

    def missing_entity_handler(method):
        """Decorator to handle missing entities in sync methods."""

        def wrapper(*args, **kwargs):
            try:
                return method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper

    def async_missing_entity_handler(method):
        """Decorator to handle missing entities in async methods."""

        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            try:
                return await method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper

    def getStorageFileviewTable(self):
        """Returns the storageFileviewTable obtained during initialization."""
        return self.storageFileviewTable

    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.

        Args:
            currentUserId: synapse id for the user whose projects we want to get.

        Returns:
            A dictionary with a next page token and the results.
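
        Example (hypothetical shape; the fields mirror what the code below
        reads from the Synapse /projects/user endpoint):
            >>> store.getPaginatedRestResults(store.syn.credentials.owner_id)
            {'results': [{'id': 'syn...', 'name': '...'}, ...]}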
        """
        all_results = self.syn.restGET(
            "/projects/user/{principalId}".format(principalId=currentUserId)
        )

        while (
            "nextPageToken" in all_results
        ):  # iterate over next page token in results while there is any
            results_token = self.syn.restGET(
                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
                    principalId=currentUserId,
                    nextPageToken=all_results["nextPageToken"],
                )
            )
            all_results["results"].extend(results_token["results"])

            if "nextPageToken" in results_token:
                all_results["nextPageToken"] = results_token["nextPageToken"]
            else:
                del all_results["nextPageToken"]

        return all_results

    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

        Returns:
            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
        """

        # get the set of all storage Synapse projects accessible for this pipeline
        storageProjects = self.storageFileviewTable["projectId"].unique()

        # get the set of storage Synapse projects accessible for this user
        # get a list of projects from Synapse
        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
            current_user_id=self.syn.credentials.owner_id, syn=self.syn
        )
        project_id_to_name_dict = {}
        current_user_projects = []
        for project_header in current_user_project_headers:
            project_id_to_name_dict[project_header.get("id")] = project_header.get(
                "name"
            )
            current_user_projects.append(project_header.get("id"))

        # find set of user projects that are also in this pipeline's storage projects set
        storageProjects = list(set(storageProjects) & set(current_user_projects))

        # Limit projects to scope if specified
        if project_scope:
            storageProjects = list(set(storageProjects) & set(project_scope))

        if not storageProjects:
            raise Warning(
                f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
            )

        # prepare a return list of project IDs and names
        projects = []
        for projectId in storageProjects:
            project_name_from_project_header = project_id_to_name_dict.get(projectId)
            projects.append((projectId, project_name_from_project_header))

        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])

        return sorted_projects_list

    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
        """Gets all dataset folders under a given storage project that the current user has access to.

        Args:
            projectId: synapse ID of a storage project.

        Returns:
            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
            None: If the projectId cannot be found on Synapse.
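
        Example (hypothetical IDs and names):
            >>> store.getStorageDatasetsInProject("syn00000000")
            [('syn11111111', 'Dataset A'), ('syn22222222', 'Dataset B')]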
        """

        # select all folders and fetch their names from within the storage project;
        # if folder content type is defined, only select folders that contain datasets
        if "contentType" in self.storageFileviewTable.columns:
            foldersTable = self.storageFileviewTable[
                (self.storageFileviewTable["contentType"] == "dataset")
                & (self.storageFileviewTable["projectId"] == projectId)
            ]
        else:
            foldersTable = self.storageFileviewTable[
                (self.storageFileviewTable["type"] == "folder")
                & (self.storageFileviewTable["parentId"] == projectId)
            ]

        # get an array of tuples (folderId, folderName)
        # some folders are part of datasets; others contain datasets
        # each dataset parent is the project; folders part of a dataset have another folder as a parent
        # to get folders if and only if they contain datasets for each folder
        # check if folder's parent is the project; if so that folder contains a dataset,
        # unless the folder list has already been filtered to dataset folders based on contentType attribute above

        datasetList = []
        folderProperties = ["id", "name"]
        for folder in list(
            foldersTable[folderProperties].itertuples(index=False, name=None)
        ):
            datasetList.append(folder)

        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])

        return sorted_dataset_list

    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
    def getFilesInStorageDataset(
        self, datasetId: str, fileNames: List = None, fullpath: bool = True
    ) -> List[Tuple[str, str]]:
        """Gets all files (excluding manifest files) in a given dataset folder.

        Args:
            datasetId: synapse ID of a storage dataset.
            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
                metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
            fullpath: if True return the full path as part of this filename; otherwise return just base filename

        Returns:
            A list of files; the list consists of tuples (fileId, fileName).

        Raises:
            ValueError: Dataset ID not found.
        """
        file_list = []

        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
        if self.storageFileviewTable.empty:
            raise ValueError(
                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
            )

        child_path = self.storageFileviewTable.loc[
            self.storageFileviewTable["parentId"] == datasetId, "path"
        ]
        if child_path.empty:
            raise LookupError(
                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
            )
        child_path = child_path.iloc[0]

        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
        parent = child_path.split("/")[:-1]
        parent = "/".join(parent)

        # Format dataset path to be used in table query
        dataset_path = f"'{parent}/%'"

        # When querying, only include files to exclude entity files and subdirectories
        where_clauses = [f"path like {dataset_path}", "type='file'"]

        # Requery the fileview to specifically get the files in the given dataset
        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)

        # Exclude manifest files
        non_manifest_files = self.storageFileviewTable.loc[
            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
            :,
        ]

        # Remove all files that are not in the list of fileNames
        if fileNames:
            filename_regex = "|".join(fileNames)

            matching_files = non_manifest_files["path"].str.contains(
                filename_regex, case=False, regex=True
            )

            non_manifest_files = non_manifest_files.loc[matching_files, :]

        # Truncate path if necessary
        if not fullpath:
            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)

        # Return list of files as expected by other methods
        file_list = list(non_manifest_files.itertuples(index=False, name=None))

        return file_list

    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
        """If both censored and uncensored manifests are present, return the id of the uncensored manifest; if only censored manifests are present, or there is a single manifest, return the id of the first manifest found.
        Args:
            manifest: a dataframe containing name and id of manifests in a given asset view

        Return:
            manifest_syn_id: id of a given censored or uncensored manifest
        """
        censored_regex = re.compile(".*censored.*")
        censored = manifest["name"].str.contains(censored_regex)
        if any(censored):
            # Try to use uncensored manifest first
            not_censored = ~censored
            if any(not_censored):
                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
            # if only censored manifests are available, just use the first censored manifest
            else:
                manifest_syn_id = manifest["id"].iloc[0]

        # otherwise, use the first (implied only) version that exists
        else:
            manifest_syn_id = manifest["id"].iloc[0]

        return manifest_syn_id

    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
    def getDatasetManifest(
        self,
        datasetId: str,
        downloadFile: bool = False,
        newManifestName: str = "",
        use_temporary_folder: bool = True,
    ) -> Union[str, File]:
        """Gets the manifest associated with a given dataset.

        Args:
            datasetId: synapse ID of a storage dataset.
            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
            newManifestName: new name of a manifest that gets downloaded
            use_temporary_folder: boolean argument indicating if a temporary folder
                should be used to store the manifest file. This is useful when running
                this code as an API server where multiple requests could be made at the
                same time. This is set to False when the code is being used from the
                CLI. Defaults to True.

        Returns:
            manifest_syn_id (String): Synapse ID of existing manifest file.
            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
            "" (String): No pre-existing manifest in dataset.
        """
        manifest_data = ""

        # get a list of files containing the manifest for this dataset (if any)
        all_files = self.storageFileviewTable

        # construct regex based on manifest basename in the config
        manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv")

        # search manifest based on given manifest basename regex above
        # and return a dataframe containing name and id of manifests in a given asset view
        manifest = all_files[
            (all_files["name"].str.contains(manifest_re, regex=True))
            & (all_files["parentId"] == datasetId)
        ]

        manifest = manifest[["id", "name"]]

        # if there is no pre-existing manifest in the specified dataset
        if manifest.empty:
            logger.warning(
                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
            )
            return ""

        # if there is an existing manifest
        else:
            manifest_syn_id = self._get_manifest_id(manifest)
            if downloadFile:
                md = ManifestDownload(
                    self.syn,
                    manifest_id=manifest_syn_id,
                    synapse_entity_tracker=self.synapse_entity_tracker,
                )
                manifest_data = md.download_manifest(
                    newManifestName=newManifestName,
                    manifest_df=manifest,
                    use_temporary_folder=use_temporary_folder,
                )
                # TODO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
                # then we should catch the error here without returning an empty string.
                if not manifest_data:
                    logger.debug(
                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
                    )
                return manifest_data
            return manifest_syn_id

    def getDataTypeFromManifest(self, manifestId: str):
        """Fetch a manifest and return data types of all columns
        Args:
            manifestId: synapse ID of a manifest
        """
        # get manifest file path
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifestId, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path

        # load manifest dataframe
        manifest = load_df(
            manifest_filepath,
            preserve_raw_input=False,
            data_model=False,
        )

        # convert the dataFrame to use best possible dtypes.
        manifest_new = manifest.convert_dtypes()

        # get data types of columns
        result = manifest_new.dtypes.to_frame("dtypes").reset_index()

        # return the result as a dictionary
        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()

        return result_dict

    def _get_files_metadata_from_dataset(
        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
    ) -> Optional[dict]:
        """retrieve file ids under a particular datasetId

        Args:
            datasetId (str): a dataset id
            only_new_files (bool): whether to only add new files that do not already exist
            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.
        """
        # get manifest file path
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifestId, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path

        # load manifest dataframe
        manifest = load_df(
            manifest_filepath,
            preserve_raw_input=False,
            data_model=False,
        )

        # convert the dataFrame to use best possible dtypes.
        manifest_new = manifest.convert_dtypes()

        # get data types of columns
        result = manifest_new.dtypes.to_frame("dtypes").reset_index()

        # return the result as a dictionary
        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()

        return result_dict

    def _get_files_metadata_from_dataset(
        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
    ) -> Optional[dict]:
        """Retrieve file names and entity ids under a particular datasetId.

        Args:
            datasetId (str): a dataset id
            only_new_files (bool): if True, only include files that do not already exist in the manifest
            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.

        Returns:
            a dictionary that maps filenames to entity ids under the given datasetId,
            or None if there are no files under the given dataset id
        """
        dataset_files = self.getFilesInStorageDataset(datasetId)
        if dataset_files:
            dataset_file_names_id_dict = self._get_file_entityIds(
                dataset_files, only_new_files=only_new_files, manifest=manifest
            )
            return dataset_file_names_id_dict
        else:
            return None

    def add_entity_id_and_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> pd.DataFrame:
        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and a Filename column is present but completely empty

        Returns:
            pd.DataFrame: the updated manifest dataframe
        """
        # get file names and entity ids of a given dataset
        dataset_files_dict = self._get_files_metadata_from_dataset(
            datasetId, only_new_files=False
        )

        if dataset_files_dict:
            # turn manifest dataframe back to a dictionary for operation
            manifest_dict = manifest.to_dict("list")

            # update Filename column
            # add entityId column to the end
            manifest_dict.update(dataset_files_dict)

            # if the component column exists in existing manifest, fill up that column
            if "Component" in manifest_dict.keys():
                manifest_dict["Component"] = manifest_dict["Component"] * max(
                    1, len(manifest_dict["Filename"])
                )

            # turn dictionary back to a dataframe
            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
            manifest_df_updated = manifest_df_index.transpose()

            # fill na with empty string
            manifest_df_updated = manifest_df_updated.fillna("")

            # drop index
            manifest_df_updated = manifest_df_updated.reset_index(drop=True)

            return manifest_df_updated
        else:
            return manifest

    def fill_in_entity_id_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> Tuple[List, pd.DataFrame]:
        """Fill in the Filename and entityId columns. Both columns will be created if not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe.
967 968 Returns: 969 Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe 970 """ 971 # get dataset file names and entity id as a list of tuple 972 dataset_files = self.getFilesInStorageDataset(datasetId) 973 974 # update manifest with additional filenames, if any 975 # note that if there is an existing manifest and there are files in the dataset 976 # the columns Filename and entityId are assumed to be present in manifest schema 977 # TODO: use idiomatic panda syntax 978 if not dataset_files: 979 manifest = manifest.fillna("") 980 return dataset_files, manifest 981 982 all_files = self._get_file_entityIds( 983 dataset_files=dataset_files, only_new_files=False, manifest=manifest 984 ) 985 new_files = self._get_file_entityIds( 986 dataset_files=dataset_files, only_new_files=True, manifest=manifest 987 ) 988 989 all_files = pd.DataFrame(all_files) 990 new_files = pd.DataFrame(new_files) 991 992 # update manifest so that it contains new dataset files 993 manifest = ( 994 pd.concat([manifest, new_files], sort=False) 995 .reset_index() 996 .drop("index", axis=1) 997 ) 998 999 # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata 1000 manifest_reindex = manifest.set_index("entityId") 1001 all_files_reindex = all_files.set_index("entityId") 1002 all_files_reindex_like_manifest = all_files_reindex.reindex_like( 1003 manifest_reindex 1004 ) 1005 1006 # Check if individual file paths in manifest and from synapse match 1007 file_paths_match = ( 1008 manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"] 1009 ) 1010 1011 # If all the paths do not match, update the manifest with the filepaths from synapse 1012 if not file_paths_match.all(): 1013 manifest_reindex.loc[ 1014 ~file_paths_match, "Filename" 1015 ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"] 1016 1017 # reformat manifest for further use 1018 manifest = manifest_reindex.reset_index() 1019 entityIdCol = manifest.pop("entityId") 1020 manifest.insert(len(manifest.columns), "entityId", entityIdCol) 1021 1022 manifest = manifest.fillna("") 1023 return dataset_files, manifest 1024 1025 @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles") 1026 def updateDatasetManifestFiles( 1027 self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True 1028 ) -> Union[Tuple[str, pd.DataFrame], None]: 1029 """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any. 1030 1031 Args: 1032 dmge: DataModelGraphExplorer Instance 1033 datasetId: synapse ID of a storage dataset. 1034 store: if set to True store updated manifest in asset store; if set to False 1035 return a Pandas dataframe containing updated manifest but do not store to asset store 1036 1037 1038 Returns: 1039 Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. 
            If there is no existing manifest or if the manifest does not have an entityId column, return None
        """

        # get existing manifest Synapse ID
        manifest_id = self.getDatasetManifest(datasetId)

        # if there is no manifest return None
        if not manifest_id:
            return None

        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_id, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path
        manifest = load_df(manifest_filepath)

        # If the manifest does not have an entityId column, trigger a new manifest to be generated
        if "entityId" not in manifest.columns:
            return None

        manifest_is_file_based = "Filename" in manifest.columns

        if manifest_is_file_based:
            # update manifest with additional filenames, if any
            # note that if there is an existing manifest and there are files in the dataset
            # the columns Filename and entityId are assumed to be present in manifest schema
            # TODO: use idiomatic pandas syntax
            dataset_files, manifest = self.fill_in_entity_id_filename(
                datasetId, manifest
            )
            if dataset_files:
                # update the manifest file, so that it contains the relevant entity IDs
                if store:
                    manifest.to_csv(manifest_filepath, index=False)

                    # store manifest and update associated metadata with manifest on Synapse
                    manifest_id = self.associateMetadataWithFiles(
                        dmge, manifest_filepath, datasetId
                    )

        return manifest_id, manifest

    def _get_file_entityIds(
        self,
        dataset_files: List,
        only_new_files: bool = False,
        manifest: pd.DataFrame = None,
    ):
        """
        Get a dictionary of files in a dataset: either only the files that are not in the current manifest, or all files.

        Args:
            dataset_files: List of all files in a dataset
            only_new_files: boolean to control whether only new files are returned or all files in the dataset
            manifest: metadata manifest; required when `only_new_files` is True
        Returns:
            files: dictionary of file names and entityIds, with scope as specified by `only_new_files`
        """
        files = {"Filename": [], "entityId": []}

        if only_new_files:
            if manifest is None:
                raise ValueError(
                    "No manifest was passed in; a manifest is required when `only_new_files` is True."
                )

            if "entityId" not in manifest.columns:
                raise ValueError(
                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
                    "Please generate an empty manifest without annotations, manually add annotations to the "
                    "appropriate files in the manifest, and then try again."
                )

            # find new files (that are not in the current manifest) if any
            for file_id, file_name in dataset_files:
                if file_id not in manifest["entityId"].values:
                    files["Filename"].append(file_name)
                    files["entityId"].append(file_id)
        else:
            # get all files
            for file_id, file_name in dataset_files:
                files["Filename"].append(file_name)
                files["entityId"].append(file_id)

        return files
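    # Example of the structure returned by `_get_file_entityIds` (values are
    # illustrative only):
    #   {"Filename": ["sample_A.csv", "sample_B.csv"],
    #    "entityId": ["syn111", "syn222"]}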
    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
    def getProjectManifests(
        self, projectId: str
    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
        """Gets all metadata manifest files across all datasets in a specified project.

        Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest,
            as a list of tuples, one for each manifest:
                [
                    (
                        (datasetId, dataName),
                        (manifestId, manifestName),
                        (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
                    ),
                    ...
                ]

        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
        """
        component = None
        entity = None
        manifests = []

        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            # Get synID of manifest for a dataset
            manifestId = self.getDatasetManifest(datasetId)

            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
            if manifestId:
                annotations = self.getFileAnnotations(manifestId)

                # If manifest has annotations specifying component, use that
                if annotations and "Component" in annotations:
                    component = annotations["Component"]
                    entity = self.synapse_entity_tracker.get(
                        synapse_id=manifestId, syn=self.syn, download_file=False
                    )
                    manifest_name = entity["properties"]["name"]

                # otherwise download the manifest and parse for information
                else:
                    logging.debug(
                        f"No component annotations have been found for manifest {manifestId}. "
                        "The manifest will be downloaded and parsed instead. "
                        "For increased speed, add component annotations to manifest."
                    )

                    manifest_info = self.getDatasetManifest(
                        datasetId, downloadFile=True
                    )
                    manifest_name = manifest_info["properties"].get("name", "")

                    if not manifest_name:
                        logger.error(f"Failed to download manifests from {datasetId}")

                    manifest_path = manifest_info["path"]

                    manifest_df = load_df(manifest_path)

                    # Get component from component column if it exists
                    if (
                        "Component" in manifest_df
                        and not manifest_df["Component"].empty
                    ):
                        component = list(set(manifest_df["Component"]))

                        # Added to address issues raised during DCA testing
                        if "" in component:
                            component.remove("")

                        if len(component) == 1:
                            component = component[0]
                        elif len(component) > 1:
                            logging.warning(
                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
                                "Behavior of manifests with multiple components is undefined."
                            )
            else:
                manifest_name = ""
                component = None
            if component:
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    (component, component),
                )
            elif manifestId:
                logging.debug(
                    f"Manifest {manifestId} does not have an associated Component"
                )
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    ("", ""),
                )
            else:
                manifest = (
                    (datasetId, datasetName),
                    ("", ""),
                    ("", ""),
                )

            if manifest:
                manifests.append(manifest)

        return manifests
    def upload_project_manifests_to_synapse(
        self, dmge: DataModelGraphExplorer, projectId: str
    ) -> List[str]:
        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.

        Returns: a list of the names of the datasets whose manifests were uploaded.
        """

        manifest_loaded = []
        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
            if manifest_info:
                manifest_path = manifest_info["path"]
                manifest_df = load_df(manifest_path)
                manifest_table_id = self.uploadDB(
                    dmge=dmge,
                    manifest=manifest_df,
                    datasetId=datasetId,
                    table_name=datasetName,
                )
                manifest_loaded.append(datasetName)
        return manifest_loaded
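    # A minimal invocation sketch for the method above (the project ID is
    # hypothetical):
    #   loaded_datasets = synStore.upload_project_manifests_to_synapse(dmge, "syn12345")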
    def upload_annotated_project_manifests_to_synapse(
        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
    ) -> Tuple[List, List]:
        """
        Purpose:
            For all manifests in a project, upload them as a table and add annotations to the manifest csv.
            Assumes the manifest is already present as a CSV in a dataset in the project.
        """
        # DataModelParser and DataModelGraph are not imported at the top of this
        # module; import them here so this method is self-contained.
        from schematic.schemas.data_model_graph import DataModelGraph
        from schematic.schemas.data_model_parser import DataModelParser

        # Instantiate DataModelParser
        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
        # Parse Model
        parsed_data_model = data_model_parser.parse_model()

        # Instantiate DataModelGraph
        data_model_grapher = DataModelGraph(parsed_data_model)

        # Generate graph
        graph_data_model = data_model_grapher.generate_data_model_graph()

        # Instantiate DataModelGraphExplorer
        dmge = DataModelGraphExplorer(graph_data_model)

        manifests = []
        manifest_loaded = []
        datasets = self.getStorageDatasetsInProject(projectId)
        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
            manifests.append(manifest)

            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)

            if manifest_info:
                manifest_id = manifest_info["properties"]["id"]
                manifest_name = manifest_info["properties"]["name"]
                manifest_path = manifest_info["path"]
                manifest = (
                    (datasetId, datasetName),
                    (manifest_id, manifest_name),
                    ("", ""),
                )
                if not dry_run:
                    self.associateMetadataWithFiles(
                        dmge, manifest_path, datasetId, manifest_record_type="table"
                    )
                manifest_loaded.append(manifest)

        return manifests, manifest_loaded

    def move_entities_to_new_project(
        self,
        projectId: str,
        newProjectId: str,
        returnEntities: bool = False,
        dry_run: bool = False,
    ):
        """
        For each manifest csv in a project, look up all the entity ids that are associated with it.
        Look up each entity in the files and move the entity to the new project.
        """

        manifests = []
        manifest_loaded = []
        datasets = self.getStorageDatasetsInProject(projectId)
        if datasets:
            for datasetId, datasetName in datasets:
                # encode information about the manifest in a simple list (so that R clients can unpack it)
                # eventually can serialize differently

                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
                manifests.append(manifest)

                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
                if manifest_info:
                    manifest_id = manifest_info["properties"]["id"]
                    manifest_name = manifest_info["properties"]["name"]
                    manifest_path = manifest_info["path"]
                    manifest_df = load_df(manifest_path)

                    manifest = (
                        (datasetId, datasetName),
                        (manifest_id, manifest_name),
                        ("", ""),
                    )
                    manifest_loaded.append(manifest)

                    annotation_entities = self.storageFileviewTable[
                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
                        & (self.storageFileviewTable["type"] == "folder")
                    ]["id"]

                    if returnEntities:
                        for entityId in annotation_entities:
                            if not dry_run:
                                moved_entity = self.syn.move(entityId, datasetId)
                                self.synapse_entity_tracker.add(
                                    synapse_id=moved_entity.id, entity=moved_entity
                                )
                            else:
                                logging.info(
                                    f"{entityId} will be moved to folder {datasetId}."
1376 ) 1377 else: 1378 # generate project folder 1379 archive_project_folder = Folder( 1380 projectId + "_archive", parent=newProjectId 1381 ) 1382 archive_project_folder = self.syn.store(archive_project_folder) 1383 self.synapse_entity_tracker.add( 1384 synapse_id=archive_project_folder.id, 1385 entity=archive_project_folder, 1386 ) 1387 1388 # generate dataset folder 1389 dataset_archive_folder = Folder( 1390 "_".join([datasetId, datasetName, "archive"]), 1391 parent=archive_project_folder.id, 1392 ) 1393 dataset_archive_folder = self.syn.store(dataset_archive_folder) 1394 self.synapse_entity_tracker.add( 1395 synapse_id=dataset_archive_folder.id, 1396 entity=dataset_archive_folder, 1397 ) 1398 1399 for entityId in annotation_entities: 1400 # move entities to folder 1401 if not dry_run: 1402 moved_entity = self.syn.move( 1403 entityId, dataset_archive_folder.id 1404 ) 1405 self.synapse_entity_tracker.add( 1406 synapse_id=moved_entity.id, entity=moved_entity 1407 ) 1408 else: 1409 logging.info( 1410 f"{entityId} will be moved to folder {dataset_archive_folder.id}." 1411 ) 1412 else: 1413 raise LookupError( 1414 f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry." 1415 ) 1416 return manifests, manifest_loaded 1417 1418 @tracer.start_as_current_span("SynapseStorage::get_synapse_table") 1419 def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]: 1420 """Download synapse table as a pd dataframe; return table schema and etags as results too 1421 1422 Args: 1423 synapse_id: synapse ID of the table to query 1424 """ 1425 1426 results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id)) 1427 df = results.asDataFrame( 1428 rowIdAndVersionInIndex=False, 1429 na_values=STR_NA_VALUES_FILTERED, 1430 keep_default_na=False, 1431 ) 1432 1433 return df, results 1434 1435 @missing_entity_handler 1436 @tracer.start_as_current_span("SynapseStorage::uploadDB") 1437 def uploadDB( 1438 self, 1439 dmge: DataModelGraphExplorer, 1440 manifest: pd.DataFrame, 1441 datasetId: str, 1442 table_name: str, 1443 restrict: bool = False, 1444 table_manipulation: str = "replace", 1445 table_column_names: str = "class_label", 1446 ): 1447 """ 1448 Method to upload a database to an asset store. In synapse, this will upload a metadata table 1449 1450 Args: 1451 dmge: DataModelGraphExplorer object 1452 manifest: pd.Df manifest to upload 1453 datasetId: synID of the dataset for the manifest 1454 table_name: name of the table to be uploaded 1455 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1456 existingTableId: str of the synId of the existing table, if one already exists 1457 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1458 table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display 1459 name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain 1460 display label formatting. 
        Returns:
            manifest_table_id: synID of the uploaded table
            manifest: the original manifest
            table_manifest: manifest formatted appropriately for the table

        """

        col_schema, table_manifest = self.formatDB(
            dmge=dmge, manifest=manifest, table_column_names=table_column_names
        )

        manifest_table_id = self.buildDB(
            datasetId,
            table_name,
            col_schema,
            table_manifest,
            table_manipulation,
            dmge,
            restrict,
        )

        return manifest_table_id, manifest, table_manifest

    @tracer.start_as_current_span("SynapseStorage::formatDB")
    def formatDB(self, dmge, manifest, table_column_names):
        """
        Method to format a manifest appropriately for upload as a table.

        Args:
            dmge: DataModelGraphExplorer object
            manifest: pd.Df manifest to upload
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
        Returns:
            col_schema: schema for table columns: type, size, etc
            table_manifest: formatted manifest

        """
        # Rename the manifest columns to display names to match fileview

        blacklist_chars = ["(", ")", ".", " ", "-"]
        manifest_columns = manifest.columns.tolist()

        table_manifest = deepcopy(manifest)

        if table_column_names == "display_name":
            cols = table_manifest.columns

        elif table_column_names == "display_label":
            cols = [
                str(col).translate({ord(x): "" for x in blacklist_chars})
                for col in manifest_columns
            ]

        elif table_column_names == "class_label":
            cols = [
                get_class_label_from_display_name(str(col)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )
                for col in manifest_columns
            ]
        else:
            raise ValueError(
                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
            )

        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))

        # Reset column names in table manifest
        table_manifest.columns = cols

        # move entity id to end of df
        entity_col = table_manifest.pop("entityId")
        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)

        # Get the column schema
        col_schema = as_table_columns(table_manifest)

        # Set Id column length to 64 (for some reason it is not being auto set.)
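        # Each entry in col_schema produced by `as_table_columns` is a dict-like
        # Column, e.g. {"name": "Id", "columnType": "STRING", "maximumSize": 50};
        # the sizes are inferred from the data, so the exact values may vary.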
1541 for i, col in enumerate(col_schema): 1542 if col["name"].lower() == "id": 1543 col_schema[i]["maximumSize"] = 64 1544 1545 return col_schema, table_manifest 1546 1547 @tracer.start_as_current_span("SynapseStorage::buildDB") 1548 def buildDB( 1549 self, 1550 datasetId: str, 1551 table_name: str, 1552 col_schema: List, 1553 table_manifest: pd.DataFrame, 1554 table_manipulation: str, 1555 dmge: DataModelGraphExplorer, 1556 restrict: bool = False, 1557 ): 1558 """ 1559 Method to construct the table appropriately: create new table, replace existing, or upsert new into existing 1560 Calls TableOperations class to execute 1561 1562 Args: 1563 datasetId: synID of the dataset for the manifest 1564 table_name: name of the table to be uploaded 1565 col_schema: schema for table columns: type, size, etc from `formatDB` 1566 table_manifest: formatted manifest that can be uploaded as a table 1567 table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert) 1568 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 1569 1570 Returns: 1571 manifest_table_id: synID of the uploaded table 1572 1573 """ 1574 table_parent_id = self.getDatasetProject(datasetId=datasetId) 1575 existing_table_id = self.syn.findEntityId( 1576 name=table_name, parent=table_parent_id 1577 ) 1578 1579 tableOps = TableOperations( 1580 synStore=self, 1581 tableToLoad=table_manifest, 1582 tableName=table_name, 1583 datasetId=datasetId, 1584 existingTableId=existing_table_id, 1585 restrict=restrict, 1586 synapse_entity_tracker=self.synapse_entity_tracker, 1587 ) 1588 1589 if not table_manipulation or existing_table_id is None: 1590 manifest_table_id = tableOps.createTable( 1591 columnTypeDict=col_schema, 1592 specifySchema=True, 1593 ) 1594 elif existing_table_id is not None: 1595 if table_manipulation.lower() == "replace": 1596 manifest_table_id = tableOps.replaceTable( 1597 specifySchema=True, 1598 columnTypeDict=col_schema, 1599 ) 1600 elif table_manipulation.lower() == "upsert": 1601 manifest_table_id = tableOps.upsertTable( 1602 dmge=dmge, 1603 ) 1604 elif table_manipulation.lower() == "update": 1605 manifest_table_id = tableOps.updateTable() 1606 1607 if table_manipulation and table_manipulation.lower() == "upsert": 1608 table_entity = self.synapse_entity_tracker.get( 1609 synapse_id=existing_table_id or manifest_table_id, 1610 syn=self.syn, 1611 download_file=False, 1612 ) 1613 annos = OldAnnotations( 1614 id=table_entity.id, 1615 etag=table_entity.etag, 1616 values=table_entity.annotations, 1617 ) 1618 annos["primary_key"] = table_manifest["Component"][0] + "_id" 1619 annos = self.syn.set_annotations(annos) 1620 table_entity.etag = annos.etag 1621 table_entity.annotations = annos 1622 1623 return manifest_table_id 1624 1625 @tracer.start_as_current_span("SynapseStorage::upload_manifest_file") 1626 def upload_manifest_file( 1627 self, 1628 manifest, 1629 metadataManifestPath, 1630 datasetId, 1631 restrict_manifest, 1632 component_name="", 1633 ): 1634 # Update manifest to have the new entityId column 1635 manifest.to_csv(metadataManifestPath, index=False) 1636 1637 # store manifest to Synapse as a CSV 1638 # update file name 1639 file_name_full = metadataManifestPath.split("/")[-1] 1640 file_extension = file_name_full.split(".")[-1] 1641 1642 # Differentiate "censored" and "uncensored" manifest 1643 if "censored" in file_name_full: 1644 file_name_new = ( 
1645 os.path.basename(CONFIG.synapse_manifest_basename) 1646 + "_" 1647 + component_name 1648 + "_censored" 1649 + "." 1650 + file_extension 1651 ) 1652 else: 1653 file_name_new = ( 1654 os.path.basename(CONFIG.synapse_manifest_basename) 1655 + "_" 1656 + component_name 1657 + "." 1658 + file_extension 1659 ) 1660 1661 manifest_synapse_file = None 1662 try: 1663 # Rename the file to file_name_new then revert 1664 # This is to maintain the original file name in-case other code is 1665 # expecting that the file exists with the original name 1666 original_file_path = metadataManifestPath 1667 new_file_path = os.path.join( 1668 os.path.dirname(metadataManifestPath), file_name_new 1669 ) 1670 os.rename(original_file_path, new_file_path) 1671 1672 manifest_synapse_file = self._store_file_for_manifest_upload( 1673 new_file_path=new_file_path, 1674 dataset_id=datasetId, 1675 existing_file_name=file_name_full, 1676 file_name_new=file_name_new, 1677 restrict_manifest=restrict_manifest, 1678 ) 1679 manifest_synapse_file_id = manifest_synapse_file.id 1680 1681 finally: 1682 # Revert the file name back to the original 1683 os.rename(new_file_path, original_file_path) 1684 1685 if manifest_synapse_file: 1686 manifest_synapse_file.path = original_file_path 1687 1688 return manifest_synapse_file_id 1689 1690 def _store_file_for_manifest_upload( 1691 self, 1692 new_file_path: str, 1693 dataset_id: str, 1694 existing_file_name: str, 1695 file_name_new: str, 1696 restrict_manifest: bool, 1697 ) -> File: 1698 """Handles a create or update of a manifest file that is going to be uploaded. 1699 If we already have a copy of the Entity in memory we will update that instance, 1700 otherwise create a new File instance to be created in Synapse. Once stored 1701 this will add the file to the `synapse_entity_tracker` for future reference. 
1702 1703 Args: 1704 new_file_path (str): The path to the new manifest file 1705 dataset_id (str): The Synapse ID of the dataset the manifest is associated with 1706 existing_file_name (str): The name of the existing file 1707 file_name_new (str): The name of the new file 1708 restrict_manifest (bool): Whether the manifest should be restricted 1709 1710 Returns: 1711 File: The stored manifest file 1712 """ 1713 local_tracked_file_instance = ( 1714 self.synapse_entity_tracker.search_local_by_parent_and_name( 1715 name=existing_file_name, parent_id=dataset_id 1716 ) 1717 or self.synapse_entity_tracker.search_local_by_parent_and_name( 1718 name=file_name_new, parent_id=dataset_id 1719 ) 1720 ) 1721 1722 if local_tracked_file_instance: 1723 local_tracked_file_instance.path = new_file_path 1724 local_tracked_file_instance.description = ( 1725 "Manifest for dataset " + dataset_id 1726 ) 1727 manifest_synapse_file = local_tracked_file_instance 1728 else: 1729 manifest_synapse_file = File( 1730 path=new_file_path, 1731 description="Manifest for dataset " + dataset_id, 1732 parent=dataset_id, 1733 name=file_name_new, 1734 ) 1735 1736 manifest_synapse_file = self.syn.store( 1737 manifest_synapse_file, isRestricted=restrict_manifest 1738 ) 1739 1740 self.synapse_entity_tracker.add( 1741 synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file 1742 ) 1743 return manifest_synapse_file 1744 1745 async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]: 1746 """get annotations asynchronously 1747 1748 Args: 1749 synapse_id (str): synapse id of the entity that the annotation belongs 1750 1751 Returns: 1752 Dict[str, Any]: The requested entity bundle matching 1753 <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html> 1754 """ 1755 return await get_entity_id_bundle2( 1756 entity_id=synapse_id, 1757 request={"includeAnnotations": True}, 1758 synapse_client=self.syn, 1759 ) 1760 1761 async def store_async_annotation(self, annotation_dict: dict) -> Annotations: 1762 """store annotation in an async way 1763 1764 Args: 1765 annotation_dict (dict): annotation in a dictionary format 1766 1767 Returns: 1768 Annotations: The stored annotations. 1769 """ 1770 annotation_data = Annotations.from_dict( 1771 synapse_annotations=annotation_dict["annotations"]["annotations"] 1772 ) 1773 annotation_class = Annotations( 1774 annotations=annotation_data, 1775 etag=annotation_dict["annotations"]["etag"], 1776 id=annotation_dict["annotations"]["id"], 1777 ) 1778 annotation_storage_result = await annotation_class.store_async( 1779 synapse_client=self.syn 1780 ) 1781 local_entity = self.synapse_entity_tracker.get( 1782 synapse_id=annotation_dict["annotations"]["id"], 1783 syn=self.syn, 1784 download_file=False, 1785 retrieve_if_not_present=False, 1786 ) 1787 if local_entity: 1788 local_entity.etag = annotation_storage_result.etag 1789 local_entity.annotations = annotation_storage_result 1790 return annotation_storage_result 1791 1792 def process_row_annotations( 1793 self, 1794 dmge: DataModelGraphExplorer, 1795 metadata_syn: Dict[str, Any], 1796 hide_blanks: bool, 1797 csv_list_regex: str, 1798 annos: Dict[str, Any], 1799 annotation_keys: str, 1800 ) -> Dict[str, Any]: 1801 """Processes metadata annotations based on the logic below: 1802 1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is: 1803 An empty or whitespace-only string. 1804 A NaN value (if the annotation is a float). 
        If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that annotation key is skipped.
        If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.

        2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name".
        Check if the rule contains "list" as a rule; if it does, split the string by commas and assign the resulting list as the annotation value for that key.

        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

        4. Returns the updated annotations dictionary.

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer
            metadata_syn (dict): metadata used for Synapse storage
            hide_blanks (bool): if true, does not upload annotation keys with blank values.
            csv_list_regex (str): Regex to match with comma separated list
            annos (Dict[str, Any]): dictionary of annotations returned from synapse
            annotation_keys (str): display_label/class_label

        Returns:
            Dict[str, Any]: annotations as a dictionary

        ```mermaid
        flowchart TD
            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
            C -- Yes --> D{Is hide_blanks True?}
            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
            D -- No --> F[Assign empty string to annotation key]
            C -- No --> G{Is anno_v a string?}
            G -- No --> H[Assign original value of anno_v to annotation key]
            G -- Yes --> I{Does anno_v match csv_list_regex?}
            I -- Yes --> J[Get validation rule of anno_k]
            J --> K{Does the validation rule contain 'list'}
            K -- Yes --> L[Split anno_v by commas and assign as list]
            I -- No --> H
            K -- No --> H
        ```
        """
        for anno_k, anno_v in metadata_syn.items():
            # Remove keys whose values are nan, empty strings, or strings that only
            # contain whitespace from the dict of annotations to be uploaded,
            # if present on the current data annotation
            if hide_blanks and (
                (isinstance(anno_v, str) and anno_v.strip() == "")
                or (isinstance(anno_v, float) and np.isnan(anno_v))
            ):
                if anno_k in annos["annotations"]["annotations"]:
                    annos["annotations"]["annotations"].pop(anno_k)
                continue

            # Otherwise save the annotation as appropriate
            if isinstance(anno_v, float) and np.isnan(anno_v):
                annos["annotations"]["annotations"][anno_k] = ""
                continue

            # Handle strings that match the csv_list_regex and pass the validation rule
            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
                # Use a dictionary to dynamically choose the argument
                param = (
                    {"node_display_name": anno_k}
                    if annotation_keys == "display_label"
                    else {"node_label": anno_k}
                )
                node_validation_rules = dmge.get_node_validation_rules(**param)

                if rule_in_rule_list("list", node_validation_rules):
                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
                    continue
            # default: assign the original value
            annos["annotations"]["annotations"][anno_k] = anno_v

        return annos
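    # Sketch of the `annos` payload consumed and returned by
    # `process_row_annotations` (ids/etags illustrative, annotation values elided):
    #   {"annotations": {"id": "syn111", "etag": "...", "annotations": {...}}}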
    @async_missing_entity_handler
    async def format_row_annotations(
        self,
        dmge: DataModelGraphExplorer,
        row: pd.Series,
        entityId: str,
        hideBlanks: bool,
        annotation_keys: str,
    ) -> Union[None, Dict[str, Any]]:
        """Format row annotations

        Args:
            dmge (DataModelGraphExplorer): data model graph explorer object
            row (pd.Series): row of the manifest
            entityId (str): entity id of the manifest
            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, uploads annotation keys with empty string values
            annotation_keys (str): display_label/class_label

        Returns:
            Union[None, Dict[str, Any]]: if the entity id is in the trash can, return None. Otherwise, return the annotations
        """
        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g. no spaces, parentheses)
        # note: the removal of special characters will apply only to annotation keys; we are not altering the manifest
        # this could create a divergence between manifest columns and annotations. this should be ok for most use cases.
        # columns with special characters are outside of the schema
        metadataSyn = {}
        blacklist_chars = ["(", ")", ".", " ", "-"]

        for k, v in row.to_dict().items():
            if annotation_keys == "display_label":
                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
            elif annotation_keys == "class_label":
                keySyn = get_class_label_from_display_name(str(k)).translate(
                    {ord(x): "" for x in blacklist_chars}
                )

            # Skip `Filename` and `ETag` columns when setting annotations
            if keySyn in ["Filename", "ETag", "eTag"]:
                continue

            # truncate annotation values to 500 characters if the
            # size of values is greater than or equal to 500 characters
            # add an explicit [truncatedByDataCuratorApp] message at the end
            # of every truncated message to indicate that the cell value
            # has been truncated
            if isinstance(v, str) and len(v) >= 500:
                v = v[0:472] + "[truncatedByDataCuratorApp]"

            metadataSyn[keySyn] = v

        # This will first check if the entity is already in memory, and if so, that
        # instance is used. Unfortunately, the expected return format needs to match
        # the Synapse API, so we need to convert the annotations to the expected format.
        entity = self.synapse_entity_tracker.get(
            synapse_id=entityId,
            syn=self.syn,
            download_file=False,
            retrieve_if_not_present=False,
        )
        if entity is not None:
            synapse_annotations = _convert_to_annotations_list(
                annotations=entity.annotations
            )
            annos = {
                "annotations": {
                    "id": entity.id,
                    "etag": entity.etag,
                    "annotations": synapse_annotations,
                }
            }
        else:
            annos = await self.get_async_annotation(entityId)

        # set annotation(s) for the various objects/items in a dataset on Synapse
        csv_list_regex = comma_separated_list_regex()

        annos = self.process_row_annotations(
            dmge=dmge,
            metadata_syn=metadataSyn,
            hide_blanks=hideBlanks,
            csv_list_regex=csv_list_regex,
            annos=annos,
            annotation_keys=annotation_keys,
        )

        return annos
    @missing_entity_handler
    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
    def format_manifest_annotations(self, manifest, manifest_synapse_id):
        """
        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
        For now just getting the Component.
        """

        entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
        )
        is_file = entity.concreteType.endswith(".FileEntity")
        is_table = entity.concreteType.endswith(".TableEntity")

        if is_file:
            # Get file metadata
            metadata = self.getFileAnnotations(manifest_synapse_id)

            # If there is a defined component add it to the metadata.
            if "Component" in manifest.columns:
                # Gather component information
                component = manifest["Component"].unique()

                # Double check that only a single component is listed, else raise an error.
                if len(component) != 1:
                    raise ValueError(
                        "Manifest has more than one component. Please check manifest and resubmit."
                    )

                # Add component to metadata
                metadata["Component"] = component[0]

        elif is_table:
            # Get table metadata
            metadata = self.getTableAnnotations(manifest_synapse_id)

        # Get annotations
        annos = OldAnnotations(
            id=entity.id, etag=entity.etag, values=entity.annotations
        )

        # Add metadata to the annotations
        for annos_k, annos_v in metadata.items():
            annos[annos_k] = annos_v

        return annos

    '''
    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
        """
        Purpose:
            Works very similarly to associateMetadataWithFiles except takes in the manifest
            rather than the manifest path

        """

        # Add uuid for table updates and fill.
        if not "Uuid" in manifest.columns:
            manifest["Uuid"] = ''

        for idx,row in manifest.iterrows():
            if not row["Uuid"]:
                gen_uuid = uuid.uuid4()
                row["Uuid"] = gen_uuid
                manifest.loc[idx, 'Uuid'] = gen_uuid

        # add entityId as a column if not already there or
        # fill any blanks with an empty string.
        if not "entityId" in manifest.columns:
            manifest["entityId"] = ""
        else:
            manifest["entityId"].fillna("", inplace=True)

        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
        dmge = DataModelGraphExplorer()

        # Create table name here.
2044 if 'Component' in manifest.columns: 2045 table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table' 2046 else: 2047 table_name = 'synapse_storage_manifest_table' 2048 2049 # Upload manifest as a table and get the SynID and manifest 2050 manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table( 2051 dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,) 2052 2053 # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed 2054 # also set metadata for each synapse entity as Synapse annotations 2055 for idx, row in manifest.iterrows(): 2056 if not row["entityId"]: 2057 # If not using entityIds, fill with manifest_table_id so 2058 row["entityId"] = manifest_synapse_table_id 2059 entityId = '' 2060 else: 2061 # get the entity id corresponding to this row 2062 entityId = row["entityId"] 2063 2064 # Load manifest to synapse as a CSV File 2065 manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest) 2066 2067 # Get annotations for the file manifest. 2068 manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id) 2069 2070 self.syn.set_annotations(manifest_annotations) 2071 2072 logger.info("Associated manifest file with dataset on Synapse.") 2073 2074 # Update manifest Synapse table with new entity id column. 2075 self.make_synapse_table( 2076 table_to_load = table_manifest, 2077 dataset_id = datasetId, 2078 existingTableId = manifest_synapse_table_id, 2079 table_name = table_name, 2080 update_col = 'Uuid', 2081 specify_schema = False, 2082 ) 2083 2084 # Get annotations for the table manifest 2085 manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id) 2086 self.syn.set_annotations(manifest_annotations) 2087 return manifest_synapse_table_id 2088 ''' 2089 2090 def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame: 2091 """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing. 2092 Args: 2093 metadataManifestPath (str): path where manifest is stored 2094 Returns: 2095 manifest(pd.DataFrame): Manifest loaded as a pandas dataframe 2096 Raises: 2097 FileNotFoundError: Manifest file does not exist at provided path. 2098 """ 2099 # read new manifest csv 2100 try: 2101 load_args = { 2102 "dtype": "string", 2103 } 2104 manifest = load_df( 2105 metadataManifestPath, 2106 preserve_raw_input=False, 2107 allow_na_values=False, 2108 **load_args, 2109 ) 2110 except FileNotFoundError as err: 2111 raise FileNotFoundError( 2112 f"No manifest file was found at this path: {metadataManifestPath}" 2113 ) from err 2114 return manifest 2115 2116 def _add_id_columns_to_manifest( 2117 self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer 2118 ): 2119 """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row. 2120 Args: 2121 Manifest loaded as a pd.Dataframe 2122 Returns (pd.DataFrame): 2123 Manifest df with new Id and EntityId columns (and UUID values) if they were not already present. 2124 """ 2125 2126 # Add Id for table updates and fill. 
        if not col_in_dataframe("Id", manifest):
            # See if schema has `Uuid` column specified
            try:
                uuid_col_in_schema = dmge.is_class_in_schema(
                    "Uuid"
                ) or dmge.is_class_in_schema("uuid")
            except KeyError:
                uuid_col_in_schema = False

            # Rename `Uuid` column if it wasn't specified in the schema
            if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema:
                manifest.rename(columns={"Uuid": "Id"}, inplace=True)
            # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column
            else:
                manifest["Id"] = ""

        # Retrieve the ID column name (id, Id and ID are treated the same).
        id_col_name = [col for col in manifest.columns if col.lower() == "id"][0]

        # Check if values have been added to the Id column; if not, add a UUID so the value in the row is not blank.
        for idx, row in manifest.iterrows():
            if not row[id_col_name]:
                gen_uuid = str(uuid.uuid4())
                row[id_col_name] = gen_uuid
                manifest.loc[idx, id_col_name] = gen_uuid

        # add entityId as a column if not already there or
        # fill any blanks with an empty string.
        if not col_in_dataframe("entityId", manifest):
            manifest["entityId"] = ""
        else:
            manifest["entityId"].fillna("", inplace=True)

        return manifest

    def _generate_table_name(self, manifest):
        """Helper function to generate a table name for upload to Synapse.

        Args:
            manifest: manifest loaded as a pd.Dataframe

        Returns:
            table_name (str): Name of the table to load
            component_name (str): Name of the manifest component (if applicable)
        """
        # Create table name here.
        if "Component" in manifest.columns:
            component_name = manifest["Component"][0].lower()
            table_name = component_name + "_synapse_storage_manifest_table"
        else:
            component_name = ""
            table_name = "synapse_storage_manifest_table"
        return table_name, component_name
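    # For example, a manifest whose `Component` column starts with "Biospecimen"
    # yields the table name "biospecimen_synapse_storage_manifest_table".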
    def _create_entity_id(self, idx, row, manifest, datasetId):
        """Helper function to generate an entityId and add it to the appropriate row in the manifest.

        Args:
            idx: index of the current row of the manifest being processed
            row: current row of the manifest being processed
            manifest (pd.DataFrame): loaded df containing user supplied data.
            datasetId (str): synapse ID of folder containing the dataset

        Returns:
            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
            entityId (str): Generated Entity Id.

        """
        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
        rowEntity = self.syn.store(rowEntity)
        entityId = rowEntity["id"]
        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
        row["entityId"] = entityId
        manifest.loc[idx, "entityId"] = entityId
        return manifest, entityId

    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
        """Process annotations and store them on Synapse asynchronously.

        Args:
            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by the format_row_annotations function in the previous step

        Raises:
            RuntimeError: raise a runtime error if a task failed to complete
        """
        while requests:
            done_tasks, pending_tasks = await asyncio.wait(
                requests, return_when=asyncio.FIRST_COMPLETED
            )
            requests = pending_tasks

            for completed_task in done_tasks:
                try:
                    annos = completed_task.result()

                    if isinstance(annos, Annotations):
                        logger.info(f"Successfully stored annotations for {annos.id}")
                    else:
                        # store annotations if they are not None
                        if annos:
                            entity_id = annos["annotations"]["id"]
                            logger.info(
                                f"Obtained and processed annotations for {entity_id} entity"
                            )
                            requests.add(
                                asyncio.create_task(
                                    self.store_async_annotation(annotation_dict=annos)
                                )
                            )
                except Exception as e:
                    raise RuntimeError(f"failed with { repr(e) }.") from e
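    # A sketch of how `_process_store_annos` is driven (see
    # `add_annotations_to_entities_files` below for the real call site):
    #   requests = {asyncio.create_task(self.format_row_annotations(...))}
    #   await self._process_store_annos(requests)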
    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
    async def add_annotations_to_entities_files(
        self,
        dmge,
        manifest,
        manifest_record_type: str,
        datasetId: str,
        hideBlanks: bool,
        manifest_synapse_table_id="",
        annotation_keys: str = "class_label",
    ):
        """
        Depending on the upload type, add ids to the entityId row. Add annotations to connected
        files and folders. Despite the name of this function, it also applies to folders.

        Args:
            dmge: DataModelGraphExplorer Object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
            datasetId (str): synapse ID of folder containing the dataset
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            manifest_synapse_table_id (str): Default is an empty string ''.
            annotation_keys: (str) display_label/class_label (default). Determines the labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
        Returns:
            manifest (pd.DataFrame): modified to add entityId as appropriate

        """

        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
        if "filename" in [col.lower() for col in manifest.columns]:
            # get current list of files and store as dataframe
            dataset_files = self.getFilesInStorageDataset(datasetId)
            files_and_entityIds = self._get_file_entityIds(
                dataset_files=dataset_files, only_new_files=False
            )
            file_df = pd.DataFrame(files_and_entityIds)

            # Merge dataframes to add entityIds
            manifest = manifest.merge(
                file_df, how="left", on="Filename", suffixes=["_x", None]
            ).drop("entityId_x", axis=1)

        # Fill `entityId` for each row if missing and annotate entity as appropriate
        requests = set()
        for idx, row in manifest.iterrows():
            if not row["entityId"] and (
                manifest_record_type == "file_and_entities"
                or manifest_record_type == "table_file_and_entities"
            ):
                manifest, entityId = self._create_entity_id(
                    idx, row, manifest, datasetId
                )
            elif not row["entityId"] and manifest_record_type == "table_and_file":
                # If not using entityIds, fill the column with manifest_table_id so the row is not left blank
                row["entityId"] = manifest_synapse_table_id
                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
                entityId = ""
            # If the row is the manifest table, do not add annotations
            elif row["entityId"] == manifest_synapse_table_id:
                entityId = ""
            else:
                # get the file id of the file to annotate, collected in the step above
                entityId = row["entityId"]

            # Adding annotations to connected files.
            if entityId:
                # Format annotations for Synapse
                annos_task = asyncio.create_task(
                    self.format_row_annotations(
                        dmge, row, entityId, hideBlanks, annotation_keys
                    )
                )
                requests.add(annos_task)
        await self._process_store_annos(requests)
        return manifest
    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
    def upload_manifest_as_table(
        self,
        dmge: DataModelGraphExplorer,
        manifest: pd.DataFrame,
        metadataManifestPath: str,
        datasetId: str,
        table_name: str,
        component_name: str,
        restrict: bool,
        manifest_record_type: str,
        hideBlanks: bool,
        table_manipulation: str,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and csv.

        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            table_name (str): Generated to name the table being uploaded.
            component_name (str): Name of the component manifest that is currently being uploaded.
            restrict (bool): Flag for censored data.
            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            table_manipulation (str): Specifies how the manifest table should be stored on Synapse when a table with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default is True. If False, do not add annotations to files.
        Return:
            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
        """
        # Upload manifest as a table, get the ID and updated manifest.
        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
        )

        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    manifest_synapse_table_id,
                    annotation_keys,
                )
            )
        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            restrict_manifest=restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(annotations=manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        # Update manifest Synapse table with new entity id column.
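        # "update" is forced here (rather than the caller's table_manipulation)
        # because the table was just created or replaced above; this second pass
        # only needs to write the newly filled entityId values back to it.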
        manifest_synapse_table_id, manifest, _ = self.uploadDB(
            dmge=dmge,
            manifest=manifest,
            datasetId=datasetId,
            table_name=table_name,
            restrict=restrict,
            table_manipulation="update",
            table_column_names=table_column_names,
        )

        # Set annotations for the table manifest
        manifest_annotations = self.format_manifest_annotations(
            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
        )
        annotations_manifest_table = self.syn.set_annotations(
            annotations=manifest_annotations
        )
        manifest_table_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
        )
        manifest_table_entity.annotations = annotations_manifest_table
        manifest_table_entity.etag = annotations_manifest_table.etag

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
    def upload_manifest_as_csv(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        restrict,
        manifest_record_type,
        hideBlanks,
        component_name,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a csv only.

        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            restrict (bool): Flag for censored data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
            annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Default is True. If False, do not add annotations to files.
        Return:
            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
        """
        if file_annotations_upload:
            manifest = asyncio.run(
                self.add_annotations_to_entities_files(
                    dmge,
                    manifest,
                    manifest_record_type,
                    datasetId,
                    hideBlanks,
                    annotation_keys=annotation_keys,
                )
            )

        # Load manifest to synapse as a CSV File
        manifest_synapse_file_id = self.upload_manifest_file(
            manifest,
            metadataManifestPath,
            datasetId,
            restrict,
            component_name=component_name,
        )

        # Set annotations for the file manifest.
        manifest_annotations = self.format_manifest_annotations(
            manifest, manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        return manifest_synapse_file_id
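    # Example (a sketch; the paths and IDs are hypothetical):
    #   synStore.upload_manifest_as_csv(
    #       dmge, manifest, "manifests/Biospecimen.csv", "syn12345",
    #       restrict=False, manifest_record_type="file_and_entities",
    #       hideBlanks=False, component_name="Biospecimen",
    #       annotation_keys="class_label",
    #   )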
        manifest_annotations = self.format_manifest_annotations(
            manifest, manifest_synapse_file_id
        )
        annos = self.syn.set_annotations(manifest_annotations)
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
        )
        manifest_entity.annotations = annos
        manifest_entity.etag = annos.etag

        logger.info("Associated manifest file with dataset on Synapse.")

        return manifest_synapse_file_id

    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
    def upload_manifest_combo(
        self,
        dmge,
        manifest,
        metadataManifestPath,
        datasetId,
        table_name,
        component_name,
        restrict,
        manifest_record_type,
        hideBlanks,
        table_manipulation,
        table_column_names: str,
        annotation_keys: str,
        file_annotations_upload: bool = True,
    ):
        """Upload manifest to Synapse as a table and CSV with entities.
        Args:
            dmge: DataModelGraphExplorer object
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath: path to csv containing a validated metadata manifest.
            datasetId (str): synapse ID of folder containing the dataset
            table_name (str): Generated to name the table being uploaded.
            component_name (str): Name of the component manifest that is currently being uploaded.
            restrict (bool): Flag for censored data.
            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
            hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when True. Uploads annotation keys with empty string values when False.
            table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
        Return:
            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2529 """ 2530 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2531 dmge=dmge, 2532 manifest=manifest, 2533 datasetId=datasetId, 2534 table_name=table_name, 2535 restrict=restrict, 2536 table_manipulation=table_manipulation, 2537 table_column_names=table_column_names, 2538 ) 2539 2540 if file_annotations_upload: 2541 manifest = asyncio.run( 2542 self.add_annotations_to_entities_files( 2543 dmge, 2544 manifest, 2545 manifest_record_type, 2546 datasetId, 2547 hideBlanks, 2548 manifest_synapse_table_id, 2549 annotation_keys=annotation_keys, 2550 ) 2551 ) 2552 2553 # Load manifest to synapse as a CSV File 2554 manifest_synapse_file_id = self.upload_manifest_file( 2555 manifest, metadataManifestPath, datasetId, restrict, component_name 2556 ) 2557 2558 # Set annotations for the file manifest. 2559 manifest_annotations = self.format_manifest_annotations( 2560 manifest, manifest_synapse_file_id 2561 ) 2562 file_manifest_annoations = self.syn.set_annotations(manifest_annotations) 2563 manifest_entity = self.synapse_entity_tracker.get( 2564 synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False 2565 ) 2566 manifest_entity.annotations = file_manifest_annoations 2567 manifest_entity.etag = file_manifest_annoations.etag 2568 logger.info("Associated manifest file with dataset on Synapse.") 2569 2570 # Update manifest Synapse table with new entity id column. 2571 manifest_synapse_table_id, manifest, table_manifest = self.uploadDB( 2572 dmge=dmge, 2573 manifest=manifest, 2574 datasetId=datasetId, 2575 table_name=table_name, 2576 restrict=restrict, 2577 table_manipulation="update", 2578 table_column_names=table_column_names, 2579 ) 2580 2581 # Set annotations for the table manifest 2582 manifest_annotations = self.format_manifest_annotations( 2583 manifest, manifest_synapse_table_id 2584 ) 2585 table_manifest_annotations = self.syn.set_annotations(manifest_annotations) 2586 manifest_entity = self.synapse_entity_tracker.get( 2587 synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False 2588 ) 2589 manifest_entity.annotations = table_manifest_annotations 2590 manifest_entity.etag = table_manifest_annotations.etag 2591 return manifest_synapse_file_id 2592 2593 @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles") 2594 def associateMetadataWithFiles( 2595 self, 2596 dmge: DataModelGraphExplorer, 2597 metadataManifestPath: str, 2598 datasetId: str, 2599 manifest_record_type: str = "table_file_and_entities", 2600 hideBlanks: bool = False, 2601 restrict_manifest=False, 2602 table_manipulation: str = "replace", 2603 table_column_names: str = "class_label", 2604 annotation_keys: str = "class_label", 2605 file_annotations_upload: bool = True, 2606 ) -> str: 2607 """Associate metadata with files in a storage dataset already on Synapse. 2608 Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file. 2609 2610 If this is a new manifest there could be no Synapse entities associated with the rows of this manifest 2611 this may be due to data type (e.g. clinical data) being tabular 2612 and not requiring files; to utilize uniform interfaces downstream 2613 (i.e. fileviews), a Synapse entity (a folder) is created for each row 2614 and an entity column is added to the manifest containing the resulting 2615 entity IDs; a table is also created at present as an additional interface 2616 for downstream query and interaction with the data. 

        Args:
            dmge: DataModelGraphExplorer Object
            metadataManifestPath: path to csv containing a validated metadata manifest.
                The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
                Some datasets, e.g. clinical data, do not contain file ids, but data is stored in a table: one row per item.
                In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to this file.
            datasetId: synapse ID of folder containing the dataset
            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options 'file_and_entities' and 'table_and_file' in combination.
            hideBlanks: Default is False. Boolean flag that does not upload annotation keys with blank values when True. Uploads annotation keys with empty string values when False.
            restrict_manifest (bool): Default is False. Flag for censored data.
            table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
            table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting.
            annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
                display label formatting while ensuring the label is formatted properly for Synapse annotations.
        Returns:
            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2637 """ 2638 # Read new manifest CSV: 2639 manifest = self._read_manifest(metadataManifestPath) 2640 manifest = self._add_id_columns_to_manifest(manifest, dmge) 2641 2642 table_name, component_name = self._generate_table_name(manifest) 2643 2644 # Upload manifest to synapse based on user input (manifest_record_type) 2645 if manifest_record_type == "file_only": 2646 manifest_synapse_file_id = self.upload_manifest_as_csv( 2647 dmge=dmge, 2648 manifest=manifest, 2649 metadataManifestPath=metadataManifestPath, 2650 datasetId=datasetId, 2651 restrict=restrict_manifest, 2652 hideBlanks=hideBlanks, 2653 manifest_record_type=manifest_record_type, 2654 component_name=component_name, 2655 annotation_keys=annotation_keys, 2656 file_annotations_upload=file_annotations_upload, 2657 ) 2658 elif manifest_record_type == "table_and_file": 2659 manifest_synapse_file_id = self.upload_manifest_as_table( 2660 dmge=dmge, 2661 manifest=manifest, 2662 metadataManifestPath=metadataManifestPath, 2663 datasetId=datasetId, 2664 table_name=table_name, 2665 component_name=component_name, 2666 restrict=restrict_manifest, 2667 hideBlanks=hideBlanks, 2668 manifest_record_type=manifest_record_type, 2669 table_manipulation=table_manipulation, 2670 table_column_names=table_column_names, 2671 annotation_keys=annotation_keys, 2672 file_annotations_upload=file_annotations_upload, 2673 ) 2674 elif manifest_record_type == "file_and_entities": 2675 manifest_synapse_file_id = self.upload_manifest_as_csv( 2676 dmge=dmge, 2677 manifest=manifest, 2678 metadataManifestPath=metadataManifestPath, 2679 datasetId=datasetId, 2680 restrict=restrict_manifest, 2681 hideBlanks=hideBlanks, 2682 manifest_record_type=manifest_record_type, 2683 component_name=component_name, 2684 annotation_keys=annotation_keys, 2685 file_annotations_upload=file_annotations_upload, 2686 ) 2687 elif manifest_record_type == "table_file_and_entities": 2688 manifest_synapse_file_id = self.upload_manifest_combo( 2689 dmge=dmge, 2690 manifest=manifest, 2691 metadataManifestPath=metadataManifestPath, 2692 datasetId=datasetId, 2693 table_name=table_name, 2694 component_name=component_name, 2695 restrict=restrict_manifest, 2696 hideBlanks=hideBlanks, 2697 manifest_record_type=manifest_record_type, 2698 table_manipulation=table_manipulation, 2699 table_column_names=table_column_names, 2700 annotation_keys=annotation_keys, 2701 file_annotations_upload=file_annotations_upload, 2702 ) 2703 else: 2704 raise ValueError("Please enter a valid manifest_record_type.") 2705 return manifest_synapse_file_id 2706 2707 def getTableAnnotations(self, table_id: str): 2708 """Generate dictionary of annotations for the given Synapse file. 2709 Synapse returns all custom annotations as lists since they 2710 can contain multiple values. In all cases, the values will 2711 be converted into strings and concatenated with ", ". 2712 2713 Args: 2714 fileId (str): Synapse ID for dataset file. 2715 2716 Returns: 2717 dict: Annotations as comma-separated strings. 
2718 """ 2719 try: 2720 entity = self.synapse_entity_tracker.get( 2721 synapse_id=table_id, syn=self.syn, download_file=False 2722 ) 2723 is_table = entity.concreteType.endswith(".TableEntity") 2724 annotations_raw = entity.annotations 2725 except SynapseHTTPError: 2726 # If an error occurs with retrieving entity, skip it 2727 # This could be caused by a temporary file view that 2728 # was deleted since its ID was retrieved 2729 is_file, is_table = False, False 2730 2731 # Skip anything that isn't a file or folder 2732 if not (is_table): 2733 return None 2734 2735 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2736 2737 return annotations 2738 2739 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2740 """Generate dictionary of annotations for the given Synapse file. 2741 Synapse returns all custom annotations as lists since they 2742 can contain multiple values. In all cases, the values will 2743 be converted into strings and concatenated with ", ". 2744 2745 Args: 2746 fileId (str): Synapse ID for dataset file. 2747 2748 Returns: 2749 dict: Annotations as comma-separated strings. 2750 """ 2751 2752 # Get entity metadata, including annotations 2753 try: 2754 entity = self.synapse_entity_tracker.get( 2755 synapse_id=fileId, syn=self.syn, download_file=False 2756 ) 2757 is_file = entity.concreteType.endswith(".FileEntity") 2758 is_folder = entity.concreteType.endswith(".Folder") 2759 annotations_raw = entity.annotations 2760 except SynapseHTTPError: 2761 # If an error occurs with retrieving entity, skip it 2762 # This could be caused by a temporary file view that 2763 # was deleted since its ID was retrieved 2764 is_file, is_folder = False, False 2765 2766 # Skip anything that isn't a file or folder 2767 if not (is_file or is_folder): 2768 return None 2769 2770 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2771 2772 return annotations 2773 2774 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2775 # Extract annotations from their lists and stringify. For example: 2776 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2777 annotations = dict() 2778 for key, vals in annotations_raw.items(): 2779 if isinstance(vals, list) and len(vals) == 1: 2780 annotations[key] = str(vals[0]) 2781 else: 2782 annotations[key] = ", ".join(str(v) for v in vals) 2783 2784 # Add the file entity ID and eTag, which weren't lists 2785 assert fileId == entity.id, ( 2786 "For some reason, the Synapse ID in the response doesn't match" 2787 "the Synapse ID sent in the request (via synapseclient)." 2788 ) 2789 annotations["entityId"] = fileId 2790 annotations["eTag"] = entity.etag 2791 2792 return annotations 2793 2794 def getDatasetAnnotations( 2795 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2796 ) -> pd.DataFrame: 2797 """Generate table for annotations across all files in given dataset. 2798 2799 Args: 2800 datasetId (str): Synapse ID for dataset folder. 2801 fill_na (bool): Whether to replace missing values with 2802 blank strings. 2803 force_batch (bool): Whether to force the function to use 2804 the batch mode, which uses a file view to retrieve 2805 annotations for a given dataset. Default to False 2806 unless there are more than 50 files in the dataset. 2807 2808 Returns: 2809 pd.DataFrame: Table of annotations. 
2810 """ 2811 # Get all files in given dataset 2812 dataset_files = self.getFilesInStorageDataset(datasetId) 2813 2814 # if there are no dataset files, there are no annotations 2815 # return None 2816 if not dataset_files: 2817 return pd.DataFrame() 2818 2819 dataset_files_map = dict(dataset_files) 2820 dataset_file_ids, _ = list(zip(*dataset_files)) 2821 2822 # Get annotations for each file from Step 1 2823 # Batch mode 2824 try_batch = len(dataset_files) >= 50 or force_batch 2825 if try_batch: 2826 try: 2827 logger.info("Trying batch mode for retrieving Synapse annotations") 2828 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2829 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2830 logger.info( 2831 f"Unable to create a temporary file view bound to {datasetId}. " 2832 "Defaulting to slower iterative retrieval of annotations." 2833 ) 2834 # Default to the slower non-batch method 2835 logger.info("Batch mode failed (probably due to permission error)") 2836 try_batch = False 2837 2838 # Non-batch mode 2839 if not try_batch: 2840 logger.info("Using slower (non-batch) sequential mode") 2841 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2842 # Remove any annotations for non-file/folders (stored as None) 2843 records = filter(None, records) 2844 table = pd.DataFrame.from_records(records) 2845 2846 # Add filenames for the files that "survived" annotation retrieval 2847 filenames = [dataset_files_map[i] for i in table["entityId"]] 2848 2849 if "Filename" not in table.columns: 2850 table.insert(0, "Filename", filenames) 2851 2852 # Ensure that entityId and eTag are at the end 2853 entity_ids = table.pop("entityId") 2854 etags = table.pop("eTag") 2855 table.insert(len(table.columns), "entityId", entity_ids) 2856 table.insert(len(table.columns), "eTag", etags) 2857 2858 # Missing values are filled in with empty strings for Google Sheets 2859 if fill_na: 2860 table.fillna("", inplace=True) 2861 2862 # Force all values as strings 2863 return table.astype(str) 2864 2865 def raise_final_error(retry_state): 2866 return retry_state.outcome.result() 2867 2868 def checkIfinAssetView(self, syn_id) -> str: 2869 # get data in administrative fileview for this pipeline 2870 assetViewTable = self.getStorageFileviewTable() 2871 all_files = list(assetViewTable["id"]) 2872 if syn_id in all_files: 2873 return True 2874 else: 2875 return False 2876 2877 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2878 @retry( 2879 stop=stop_after_attempt(5), 2880 wait=wait_chain( 2881 *[wait_fixed(10) for i in range(2)] 2882 + [wait_fixed(15) for i in range(2)] 2883 + [wait_fixed(20)] 2884 ), 2885 retry=retry_if_exception_type(LookupError), 2886 retry_error_callback=raise_final_error, 2887 ) 2888 def getDatasetProject(self, datasetId: str) -> str: 2889 """Get parent project for a given dataset ID. 2890 2891 Args: 2892 datasetId (str): Synapse entity ID (folder or project). 2893 2894 Raises: 2895 ValueError: Raised if Synapse ID cannot be retrieved 2896 by the user or if it doesn't appear in the file view. 2897 2898 Returns: 2899 str: The Synapse ID for the parent project. 
2900 """ 2901 2902 # Subset main file view 2903 dataset_index = self.storageFileviewTable["id"] == datasetId 2904 dataset_row = self.storageFileviewTable[dataset_index] 2905 2906 # re-query if no datasets found 2907 if dataset_row.empty: 2908 sleep(5) 2909 self.query_fileview(force_requery=True) 2910 # Subset main file view 2911 dataset_index = self.storageFileviewTable["id"] == datasetId 2912 dataset_row = self.storageFileviewTable[dataset_index] 2913 2914 # Return `projectId` for given row if only one found 2915 if len(dataset_row) == 1: 2916 dataset_project = dataset_row["projectId"].values[0] 2917 return dataset_project 2918 2919 # Otherwise, check if already project itself 2920 try: 2921 syn_object = self.synapse_entity_tracker.get( 2922 synapse_id=datasetId, syn=self.syn, download_file=False 2923 ) 2924 if syn_object.properties["concreteType"].endswith("Project"): 2925 return datasetId 2926 except SynapseHTTPError: 2927 raise PermissionError( 2928 f"The given dataset ({datasetId}) isn't accessible with this " 2929 "user. This might be caused by a typo in the dataset Synapse ID." 2930 ) 2931 2932 # If not, then assume dataset not in file view 2933 raise LookupError( 2934 f"The given dataset ({datasetId}) doesn't appear in the " 2935 f"configured file view ({self.storageFileview}). This might " 2936 "mean that the file view's scope needs to be updated." 2937 ) 2938 2939 def getDatasetAnnotationsBatch( 2940 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2941 ) -> pd.DataFrame: 2942 """Generate table for annotations across all files in given dataset. 2943 This function uses a temporary file view to generate a table 2944 instead of iteratively querying for individual entity annotations. 2945 This function is expected to run much faster than 2946 `self.getDatasetAnnotationsBatch` on large datasets. 2947 2948 Args: 2949 datasetId (str): Synapse ID for dataset folder. 2950 dataset_file_ids (Sequence[str]): List of Synapse IDs 2951 for dataset files/folders used to subset the table. 2952 2953 Returns: 2954 pd.DataFrame: Table of annotations. 2955 """ 2956 # Create data frame from annotations file view 2957 with DatasetFileView(datasetId, self.syn) as fileview: 2958 table = fileview.query() 2959 2960 if dataset_file_ids: 2961 table = table.loc[table.index.intersection(dataset_file_ids)] 2962 2963 table = table.reset_index(drop=True) 2964 2965 return table 2966 2967 def _get_table_schema_by_cname(self, table_schema): 2968 # assume no duplicate column names in the table 2969 table_schema_by_cname = {} 2970 2971 for col_record in table_schema: 2972 # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key) 2973 table_schema_by_cname[col_record["name"]] = col_record 2974 2975 return table_schema_by_cname
Implementation of the Storage interface for datasets/files stored on Synapse. Provides utilities to list files in a specific project, update file annotations, create fileviews, etc.
TODO: Need to define the interface and rename and/or refactor some of the methods below.
    @tracer.start_as_current_span("SynapseStorage::__init__")
    def __init__(
        self,
        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
        access_token: Optional[str] = None,
        project_scope: Optional[list] = None,
        synapse_cache_path: Optional[str] = None,
        perform_query: Optional[bool] = True,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
    ) -> None:
        """Initializes a SynapseStorage object.

        Args:
            token (Optional[str], optional):
                Optional token parameter as found in browser cookie upon login to synapse.
                Defaults to None.
            access_token (Optional[str], optional):
                Optional access token (personal or oauth).
                Defaults to None.
            project_scope (Optional[list], optional): Defaults to None.
            synapse_cache_path (Optional[str], optional):
                Location of synapse cache.
                Defaults to None.
        TODO:
            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
        """
        self.syn = self.login(synapse_cache_path, access_token)
        current_span = trace.get_current_span()
        if current_span.is_recording():
            current_span.set_attribute("user.id", self.syn.credentials.owner_id)
        self.project_scope = project_scope
        self.storageFileview = CONFIG.synapse_master_fileview_id
        self.manifest = CONFIG.synapse_manifest_basename
        self.root_synapse_cache = self.syn.cache.cache_root_dir
        self.synapse_entity_tracker = SynapseEntityTracker()
        if perform_query:
            self.query_fileview(columns=columns, where_clauses=where_clauses)
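A minimal usage sketch of the constructor, assuming a valid personal access token and a master fileview configured in CONFIG; the synID shown is a placeholder.

store = SynapseStorage(
    access_token="<your-token>",    # falls back to .synapseConfig if omitted
    project_scope=["syn11111111"],  # optional: limit operations to one project
    perform_query=True,             # run the fileview query on init
)
fileview_df = store.getStorageFileviewTable()  # pandas DataFrame of the fileview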
    @tracer.start_as_current_span("SynapseStorage::query_fileview")
    def query_fileview(
        self,
        columns: Optional[list] = None,
        where_clauses: Optional[list] = None,
        force_requery: Optional[bool] = False,
    ) -> None:
        """
        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
        Args:
            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
        """
        self._purge_synapse_cache()

        # Initialize to assume that the new fileview query will be different from what may already be stored.
        # Initializes to True because generally one will not have already been performed.
        self.new_query_different = True

        # If a query has already been performed, store the query
        previous_query_built = hasattr(self, "fileview_query")
        if previous_query_built:
            previous_query = self.fileview_query

        # Build a query with the current given parameters and check to see if it is different from the previous
        self._build_query(columns=columns, where_clauses=where_clauses)
        if previous_query_built:
            self.new_query_different = self.fileview_query != previous_query

        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
        if self.new_query_different or force_requery:
            try:
                self.storageFileviewTable = self.syn.tableQuery(
                    query=self.fileview_query,
                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
            except SynapseHTTPError as exc:
                exception_text = str(exc)
                if "Unknown column path" in exception_text:
                    raise ValueError(
                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
                    )
                elif "Unknown column" in exception_text:
                    missing_column = exception_text.split("Unknown column ")[-1]
                    raise ValueError(
                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
                    )
                else:
                    raise AccessCredentialsError(self.storageFileview)
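A hedged sketch of re-scoping the fileview after initialization; the column names and synID below are illustrative placeholders, not taken from a real project.

store.query_fileview(
    columns=["id", "path", "parentId"],
    where_clauses=["parentId='syn22222222'", "type='file'"],
)
scoped_df = store.storageFileviewTable  # only the requested columns and rows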
    @staticmethod
    def build_clause_from_dataset_id(
        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
    ) -> str:
        """
        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
        Args:
            dataset_id: Synapse ID of a dataset that should be used to limit the query
            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
        Returns:
            Clause for the query, or an empty string if no dataset ID is provided
        """
        # Calling this method without specifying synIDs will complete but will not scope the view
        if (not dataset_id) and (not dataset_folder_list):
            return ""

        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
        if dataset_folder_list:
            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
            return f"parentId IN ({search_folders})"

        # `dataset_id` should be provided when all files are stored directly under the dataset folder
        return f"parentId='{dataset_id}'"
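Because this is a pure static method, its behavior is easy to show directly; the synIDs below are placeholders.

SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
# -> "parentId='syn123'"
SynapseStorage.build_clause_from_dataset_id(dataset_folder_list=["syn1", "syn2"])
# -> "parentId IN ('syn1', 'syn2')"
SynapseStorage.build_clause_from_dataset_id()
# -> ""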
    @staticmethod
    @tracer.start_as_current_span("SynapseStorage::login")
    def login(
        synapse_cache_path: Optional[str] = None,
        access_token: Optional[str] = None,
    ) -> synapseclient.Synapse:
        """Login to Synapse

        Args:
            access_token (Optional[str], optional): A synapse access token. Defaults to None.
            synapse_cache_path (Optional[str]): location of synapse cache

        Raises:
            ValueError: If unable to log in with the access token

        Returns:
            synapseclient.Synapse: A Synapse object that is logged in
        """
        # If no token is provided, try retrieving access token from environment
        if not access_token:
            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")

        # Login using a token
        if access_token:
            try:
                syn = synapseclient.Synapse(
                    cache_root_dir=synapse_cache_path,
                    debug=False,
                    skip_checks=True,
                    cache_client=False,
                )
                syn.login(authToken=access_token, silent=True)
                current_span = trace.get_current_span()
                if current_span.is_recording():
                    current_span.set_attribute("user.id", syn.credentials.owner_id)
            except SynapseHTTPError as exc:
                raise ValueError(
                    "No access to resources. Please make sure that your token is correct"
                ) from exc
        else:
            # Login using synapse credentials provided by user in .synapseConfig (default) file
            syn = synapseclient.Synapse(
                configPath=CONFIG.synapse_configuration_path,
                cache_root_dir=synapse_cache_path,
                debug=False,
                skip_checks=True,
                cache_client=False,
            )
            syn.login(silent=True)
            current_span = trace.get_current_span()
            if current_span.is_recording():
                current_span.set_attribute("user.id", syn.credentials.owner_id)
        return syn
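A minimal login sketch mirroring the token branch above. SYNAPSE_ACCESS_TOKEN is the environment variable this method actually reads; the cache path and token value are placeholders.

import os

os.environ["SYNAPSE_ACCESS_TOKEN"] = "<your-token>"  # or pass access_token=... directly
syn = SynapseStorage.login(synapse_cache_path="/tmp/synapse_cache")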
    def missing_entity_handler(method):
        """Decorator to handle missing entities in synchronous methods."""

        def wrapper(*args, **kwargs):
            try:
                return method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper
    def async_missing_entity_handler(method):
        """Decorator to handle missing entities in async methods."""

        async def wrapper(*args: Any, **kwargs: Any) -> Any:
            try:
                return await method(*args, **kwargs)
            except SynapseHTTPError as ex:
                str_message = str(ex).replace("\n", "")
                if "trash" in str_message or "does not exist" in str_message:
                    logging.warning(str_message)
                    return None
                else:
                    raise ex

        return wrapper
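A self-contained sketch of the same pattern outside the class, assuming only synapseclient: a SynapseHTTPError whose message mentions a trashed or missing entity becomes a None return instead of an exception. `get_annotations_or_none` is a hypothetical helper, not part of this module.

from synapseclient.core.exceptions import SynapseHTTPError

def missing_entity_handler(method):
    def wrapper(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except SynapseHTTPError as ex:
            msg = str(ex).replace("\n", "")
            if "trash" in msg or "does not exist" in msg:
                return None  # entity was deleted between lookup and use
            raise
    return wrapper

@missing_entity_handler
def get_annotations_or_none(syn, syn_id):
    # Returns None if syn_id was trashed/deleted in the meantime
    return syn.get_annotations(syn_id)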
    def getStorageFileviewTable(self):
        """Returns the storageFileviewTable obtained during initialization."""
        return self.storageFileviewTable
    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.

        Args:
            currentUserId: synapse id for the user whose projects we want to get.

        Returns:
            A dictionary with a next page token and the results.
        """
        all_results = self.syn.restGET(
            "/projects/user/{principalId}".format(principalId=currentUserId)
        )

        # Iterate over the next page token in results while there is any
        while "nextPageToken" in all_results:
            results_token = self.syn.restGET(
                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
                    principalId=currentUserId,
                    nextPageToken=all_results["nextPageToken"],
                )
            )
            all_results["results"].extend(results_token["results"])

            if "nextPageToken" in results_token:
                all_results["nextPageToken"] = results_token["nextPageToken"]
            else:
                del all_results["nextPageToken"]

        return all_results
    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
    def getStorageProjects(
        self, project_scope: Optional[List] = None
    ) -> list[tuple[str, str]]:
        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

        Args:
            project_scope: Optional list of Synapse project IDs used to further limit the returned projects.

        Returns:
            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
        """

        # Get the set of all storage Synapse projects accessible for this pipeline
        storageProjects = self.storageFileviewTable["projectId"].unique()

        # Get the set of storage Synapse projects accessible for this user
        # by getting a list of the user's projects from Synapse
        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
            current_user_id=self.syn.credentials.owner_id, syn=self.syn
        )
        project_id_to_name_dict = {}
        current_user_projects = []
        for project_header in current_user_project_headers:
            project_id_to_name_dict[project_header.get("id")] = project_header.get(
                "name"
            )
            current_user_projects.append(project_header.get("id"))

        # Find the set of user projects that are also in this pipeline's storage projects set
        storageProjects = list(set(storageProjects) & set(current_user_projects))

        # Limit projects to scope if specified
        if project_scope:
            storageProjects = list(set(storageProjects) & set(project_scope))

        if not storageProjects:
            raise Warning(
                f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
            )

        # Prepare a return list of project IDs and names
        projects = []
        for projectId in storageProjects:
            project_name_from_project_header = project_id_to_name_dict.get(projectId)
            projects.append((projectId, project_name_from_project_header))

        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])

        return sorted_projects_list
    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
        """Gets all datasets in folders under a given storage project that the current user has access to.

        Args:
            projectId: synapse ID of a storage project.

        Returns:
            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
            None: If the projectId cannot be found on Synapse.
        """

        # Select all folders and fetch their names from within the storage project;
        # if folder content type is defined, only select folders that contain datasets
        if "contentType" in self.storageFileviewTable.columns:
            foldersTable = self.storageFileviewTable[
                (self.storageFileviewTable["contentType"] == "dataset")
                & (self.storageFileviewTable["projectId"] == projectId)
            ]
        else:
            foldersTable = self.storageFileviewTable[
                (self.storageFileviewTable["type"] == "folder")
                & (self.storageFileviewTable["parentId"] == projectId)
            ]

        # Get an array of tuples (folderId, folderName).
        # Some folders are part of datasets; others contain datasets.
        # Each dataset's parent is the project; folders that are part of a dataset have another folder as a parent.
        # To get folders if and only if they contain datasets, check for each folder
        # whether its parent is the project; if so, that folder contains a dataset,
        # unless the folder list has already been filtered to dataset folders based on the contentType attribute above.

        datasetList = []
        folderProperties = ["id", "name"]
        for folder in list(
            foldersTable[folderProperties].itertuples(index=False, name=None)
        ):
            datasetList.append(folder)

        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])

        return sorted_dataset_list
    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
    def getFilesInStorageDataset(
        self, datasetId: str, fileNames: Optional[List] = None, fullpath: bool = True
    ) -> List[Tuple[str, str]]:
        """Gets all files (excluding manifest files) in a given dataset folder.

        Args:
            datasetId: synapse ID of a storage dataset.
            fileNames: get a list of files with particular names; defaults to None, in which case all dataset files are returned (except bookkeeping files, e.g.
                metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
            fullpath: if True, return the full path as part of this filename; otherwise return just the base filename.

        Returns:
            A list of files; the list consists of tuples (fileId, fileName).

        Raises:
            ValueError: Dataset ID not found.
        """
        file_list = []

        # Get the path to the dataset folder by using children to avoid cases where the dataset is the scope of the view
        if self.storageFileviewTable.empty:
            raise ValueError(
                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
            )

        child_path = self.storageFileviewTable.loc[
            self.storageFileviewTable["parentId"] == datasetId, "path"
        ]
        if child_path.empty:
            raise LookupError(
                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
            )
        child_path = child_path.iloc[0]

        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
        parent = child_path.split("/")[:-1]
        parent = "/".join(parent)

        # Format dataset path to be used in table query
        dataset_path = f"'{parent}/%'"

        # When querying, only include rows of type 'file' to exclude folders and subdirectories
        where_clauses = [f"path like {dataset_path}", "type='file'"]

        # Requery the fileview to specifically get the files in the given dataset
        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)

        # Exclude manifest files
        non_manifest_files = self.storageFileviewTable.loc[
            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
            :,
        ]

        # Remove all files that are not in the list of fileNames
        if fileNames:
            filename_regex = "|".join(fileNames)

            matching_files = non_manifest_files["path"].str.contains(
                filename_regex, case=False, regex=True
            )

            non_manifest_files = non_manifest_files.loc[matching_files, :]

        # Truncate path if necessary
        if not fullpath:
            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)

        # Return list of files as expected by other methods
        file_list = list(non_manifest_files.itertuples(index=False, name=None))

        return file_list
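A hedged usage sketch; the dataset synID and file names below are hypothetical.

files = store.getFilesInStorageDataset(
    "syn33333333",
    fileNames=["sample_A.bam", "sample_B.bam"],  # optional name filter
    fullpath=False,                               # return base filenames only
)
# -> e.g. [("syn44444444", "sample_A.bam"), ("syn44444445", "sample_B.bam")]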
    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
    def getDatasetManifest(
        self,
        datasetId: str,
        downloadFile: bool = False,
        newManifestName: str = "",
        use_temporary_folder: bool = True,
    ) -> Union[str, File]:
        """Gets the manifest associated with a given dataset.

        Args:
            datasetId: synapse ID of a storage dataset.
            downloadFile: boolean argument indicating if the manifest file in the dataset should be downloaded or not.
            newManifestName: new name of a manifest that gets downloaded
            use_temporary_folder: boolean argument indicating if a temporary folder
                should be used to store the manifest file. This is useful when running
                this code as an API server where multiple requests could be made at the
                same time. This is set to False when the code is being used from the
                CLI. Defaults to True.

        Returns:
            manifest_syn_id (String): Synapse ID of existing manifest file.
            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
            "" (String): No pre-existing manifest in dataset.
        """
        manifest_data = ""

        # Get a list of files containing the manifest for this dataset (if any)
        all_files = self.storageFileviewTable

        # Construct regex based on the manifest basename in the config
        manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv")

        # Search for manifests based on the manifest basename regex above
        # and return a dataframe containing the name and id of manifests in a given asset view
        manifest = all_files[
            (all_files["name"].str.contains(manifest_re, regex=True))
            & (all_files["parentId"] == datasetId)
        ]

        manifest = manifest[["id", "name"]]

        # If there is no pre-existing manifest in the specified dataset
        if manifest.empty:
            logger.warning(
                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
            )
            return ""

        # If there is an existing manifest
        else:
            manifest_syn_id = self._get_manifest_id(manifest)
            if downloadFile:
                md = ManifestDownload(
                    self.syn,
                    manifest_id=manifest_syn_id,
                    synapse_entity_tracker=self.synapse_entity_tracker,
                )
                manifest_data = md.download_manifest(
                    newManifestName=newManifestName,
                    manifest_df=manifest,
                    use_temporary_folder=use_temporary_folder,
                )
                # TODO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
                # then we should catch the error here without returning an empty string.
                if not manifest_data:
                    logger.debug(
                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
                    )
                return manifest_data
            return manifest_syn_id
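A hedged usage sketch; the dataset synID is a placeholder. Note the return type depends on downloadFile.

manifest_syn_id = store.getDatasetManifest("syn33333333")  # ID only, or "" if none
manifest_file = store.getDatasetManifest("syn33333333", downloadFile=True)
if manifest_file:
    print(manifest_file.path)  # local path of the downloaded manifest csv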
    def getDataTypeFromManifest(self, manifestId: str):
        """Fetch a manifest and return the data types of all its columns.

        Args:
            manifestId: synapse ID of a manifest
        """
        # Get manifest file path
        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifestId, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path

        # Load manifest dataframe
        manifest = load_df(
            manifest_filepath,
            preserve_raw_input=False,
            data_model=False,
        )

        # Convert the dataframe to use the best possible dtypes
        manifest_new = manifest.convert_dtypes()

        # Get data types of columns
        result = manifest_new.dtypes.to_frame("dtypes").reset_index()

        # Return the result as a dictionary
        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()

        return result_dict
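What the returned dtype summary looks like on a toy frame, using only pandas (no Synapse access needed):

import pandas as pd

df = pd.DataFrame({"PatientID": ["p1", "p2"], "Age": [34, 55]})
dtypes = df.convert_dtypes().dtypes.to_frame("dtypes").reset_index()
print(dtypes.set_index("index")["dtypes"].astype(str).to_dict())
# {'PatientID': 'string', 'Age': 'Int64'}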
    def add_entity_id_and_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> pd.DataFrame:
        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and the Filename column is present but completely empty

        Returns:
            pd.DataFrame: the updated manifest dataframe
        """
        # Get file names and entity ids of a given dataset
        dataset_files_dict = self._get_files_metadata_from_dataset(
            datasetId, only_new_files=False
        )

        if dataset_files_dict:
            # Turn the manifest dataframe back into a dictionary for this operation
            manifest_dict = manifest.to_dict("list")

            # Update the Filename column and
            # add the entityId column to the end
            manifest_dict.update(dataset_files_dict)

            # If the component column exists in the existing manifest, fill up that column
            if "Component" in manifest_dict.keys():
                manifest_dict["Component"] = manifest_dict["Component"] * max(
                    1, len(manifest_dict["Filename"])
                )

            # Turn the dictionary back into a dataframe
            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
            manifest_df_updated = manifest_df_index.transpose()

            # Fill NA with empty string
            manifest_df_updated = manifest_df_updated.fillna("")

            # Drop index
            manifest_df_updated = manifest_df_updated.reset_index(drop=True)

            return manifest_df_updated
        else:
            return manifest
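The list-multiplication step above, shown in isolation: a single Component value is broadcast to match the number of files pulled from the dataset.

manifest_dict = {
    "Component": ["Biospecimen"],
    "Filename": ["a.txt", "b.txt", "c.txt"],
}
manifest_dict["Component"] = manifest_dict["Component"] * max(
    1, len(manifest_dict["Filename"])
)
print(manifest_dict["Component"])  # ['Biospecimen', 'Biospecimen', 'Biospecimen']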
    def fill_in_entity_id_filename(
        self, datasetId: str, manifest: pd.DataFrame
    ) -> Tuple[List, pd.DataFrame]:
        """Fill in the Filename and entityId columns. Both columns will be created if not already present.

        Args:
            datasetId (str): dataset syn id
            manifest (pd.DataFrame): existing manifest dataframe.

        Returns:
            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and the updated manifest dataframe
        """
        # Get dataset file names and entity ids as a list of tuples
        dataset_files = self.getFilesInStorageDataset(datasetId)

        # Update manifest with additional filenames, if any.
        # Note that if there is an existing manifest and there are files in the dataset,
        # the columns Filename and entityId are assumed to be present in the manifest schema.
        # TODO: use idiomatic pandas syntax
        if not dataset_files:
            manifest = manifest.fillna("")
            return dataset_files, manifest

        all_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=False, manifest=manifest
        )
        new_files = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=True, manifest=manifest
        )

        all_files = pd.DataFrame(all_files)
        new_files = pd.DataFrame(new_files)

        # Update manifest so that it contains new dataset files
        manifest = (
            pd.concat([manifest, new_files], sort=False)
            .reset_index()
            .drop("index", axis=1)
        )

        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
        manifest_reindex = manifest.set_index("entityId")
        all_files_reindex = all_files.set_index("entityId")
        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
            manifest_reindex
        )

        # Check if individual file paths in the manifest and from Synapse match
        file_paths_match = (
            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
        )

        # If any of the paths do not match, update the manifest with the file paths from Synapse
        if not file_paths_match.all():
            manifest_reindex.loc[
                ~file_paths_match, "Filename"
            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]

            # Reformat manifest for further use
            manifest = manifest_reindex.reset_index()
            entityIdCol = manifest.pop("entityId")
            manifest.insert(len(manifest.columns), "entityId", entityIdCol)

        manifest = manifest.fillna("")
        return dataset_files, manifest
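A toy illustration of the entityId alignment above, using only pandas: reindex_like lines Synapse file paths up with manifest rows by entityId so mismatched Filename values can be overwritten row by row.

import pandas as pd

manifest = pd.DataFrame(
    {"Filename": ["old/a.txt", "old/b.txt"], "entityId": ["syn1", "syn2"]}
).set_index("entityId")
synapse_files = pd.DataFrame(
    {"Filename": ["new/b.txt", "new/a.txt"], "entityId": ["syn2", "syn1"]}
).set_index("entityId")

aligned = synapse_files.reindex_like(manifest)  # rows reordered to syn1, syn2
mismatch = manifest["Filename"] != aligned["Filename"]
manifest.loc[mismatch, "Filename"] = aligned.loc[mismatch, "Filename"]
print(manifest["Filename"].tolist())  # ['new/a.txt', 'new/b.txt']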
    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
    def updateDatasetManifestFiles(
        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
    ) -> Union[Tuple[str, pd.DataFrame], None]:
        """Fetch the names and entity IDs of all current files in the dataset in the store, if any; update the dataset's manifest with new files, if any.

        Args:
            dmge: DataModelGraphExplorer Instance
            datasetId: synapse ID of a storage dataset.
            store: if set to True, store the updated manifest in the asset store; if set to False,
                return a pandas dataframe containing the updated manifest but do not store it in the asset store

        Returns:
            Synapse ID of the updated manifest and a pandas dataframe containing the updated manifest.
            If there is no existing manifest or if the manifest does not have an entityId column, return None.
        """

        # Get existing manifest Synapse ID
        manifest_id = self.getDatasetManifest(datasetId)

        # If there is no manifest, return None
        if not manifest_id:
            return None

        manifest_entity = self.synapse_entity_tracker.get(
            synapse_id=manifest_id, syn=self.syn, download_file=True
        )
        manifest_filepath = manifest_entity.path
        manifest = load_df(manifest_filepath)

        # If the manifest does not have an entityId column, trigger a new manifest to be generated
        if "entityId" not in manifest.columns:
            return None

        manifest_is_file_based = "Filename" in manifest.columns

        if manifest_is_file_based:
            # Update manifest with additional filenames, if any.
            # Note that if there is an existing manifest and there are files in the dataset,
            # the columns Filename and entityId are assumed to be present in the manifest schema.
            # TODO: use idiomatic pandas syntax
            dataset_files, manifest = self.fill_in_entity_id_filename(
                datasetId, manifest
            )
            if dataset_files:
                # Update the manifest file, so that it contains the relevant entity IDs
                if store:
                    manifest.to_csv(manifest_filepath, index=False)

                    # Store manifest and update associated metadata with manifest on Synapse
                    manifest_id = self.associateMetadataWithFiles(
                        dmge, manifest_filepath, datasetId
                    )

        return manifest_id, manifest
    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
    def getProjectManifests(
        self, projectId: str
    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
        """Gets all metadata manifest files across all datasets in a specified project.

        Returns: A list of datasets per project; the metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest,
            as a list of tuples, one for each manifest:
                [
                    (
                        (datasetId, dataName),
                        (manifestId, manifestName),
                        (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
                    ),
                    ...
                ]

        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
        """
        component = None
        entity = None
        manifests = []

        datasets = self.getStorageDatasetsInProject(projectId)

        for datasetId, datasetName in datasets:
            # Encode information about the manifest in a simple list (so that R clients can unpack it);
            # eventually can serialize differently

            # Get synID of manifest for a dataset
            manifestId = self.getDatasetManifest(datasetId)

            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
            if manifestId:
                annotations = self.getFileAnnotations(manifestId)

                # If manifest has annotations specifying component, use that
                if annotations and "Component" in annotations:
                    component = annotations["Component"]
                    entity = self.synapse_entity_tracker.get(
                        synapse_id=manifestId, syn=self.syn, download_file=False
                    )
                    manifest_name = entity["properties"]["name"]

                # Otherwise download the manifest and parse it for the information
                elif not annotations or "Component" not in annotations:
                    logging.debug(
                        f"No component annotations have been found for manifest {manifestId}. "
                        "The manifest will be downloaded and parsed instead. "
                        "For increased speed, add component annotations to manifest."
                    )

                    manifest_info = self.getDatasetManifest(
                        datasetId, downloadFile=True
                    )
                    manifest_name = manifest_info["properties"].get("name", "")

                    if not manifest_name:
                        logger.error(f"Failed to download manifests from {datasetId}")

                    manifest_path = manifest_info["path"]

                    manifest_df = load_df(manifest_path)

                    # Get component from component column if it exists
                    if (
                        "Component" in manifest_df
                        and not manifest_df["Component"].empty
                    ):
                        component = list(set(manifest_df["Component"]))

                        # Added to address issues raised during DCA testing
                        if "" in component:
                            component.remove("")

                        if len(component) == 1:
                            component = component[0]
                        elif len(component) > 1:
                            logging.warning(
                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
                                "Behavior of manifests with multiple components is undefined."
                            )
            else:
                manifest_name = ""
                component = None
            if component:
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    (component, component),
                )
            elif manifestId:
                logging.debug(
                    f"Manifest {manifestId} does not have an associated Component"
                )
                manifest = (
                    (datasetId, datasetName),
                    (manifestId, manifest_name),
                    ("", ""),
                )
            else:
                manifest = (
                    (datasetId, datasetName),
                    ("", ""),
                    ("", ""),
                )

            if manifest:
                manifests.append(manifest)

        return manifests
Gets all metadata manifest files across all datasets in a specified project.

Returns: A list of tuples, one per manifest, giving each dataset in the project, the metadata manifest Synapse ID for that dataset, and the corresponding schema component of the manifest:

    [
        (
            (datasetId, dataName),
            (manifestId, manifestName),
            (componentSchemaLabel, componentSchemaLabel)  TODO: get component name from schema
        ),
        ...
    ]

TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
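A minimal usage sketch (not part of the module): `store` is assumed to be an authenticated SynapseStorage instance, and the project ID is a placeholder.

    # iterate over the ((dataset), (manifest), (component)) triples described above
    for dataset, manifest, component in store.getProjectManifests(projectId="syn00000000"):
        dataset_id, dataset_name = dataset
        manifest_id, manifest_name = manifest
        component_label = component[0]
        print(dataset_id, manifest_id or "<no manifest>", component_label or "<no component>")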
def upload_project_manifests_to_synapse(
    self, dmge: DataModelGraphExplorer, projectId: str
) -> List[str]:
    """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.

    Returns: List of the dataset names for which a manifest was uploaded.
    """

    manifest_loaded = []
    datasets = self.getStorageDatasetsInProject(projectId)

    for datasetId, datasetName in datasets:
        manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
        if manifest_info:
            manifest_path = manifest_info["path"]
            manifest_df = load_df(manifest_path)
            manifest_table_id = self.uploadDB(
                dmge=dmge,
                manifest=manifest_df,
                datasetId=datasetId,
                table_name=datasetName,
            )
            manifest_loaded.append(datasetName)
    return manifest_loaded
Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
Returns: List of the dataset names for which a manifest was uploaded.
def upload_annotated_project_manifests_to_synapse(
    self, projectId: str, path_to_json_ld: str, dry_run: bool = False
) -> Tuple[List, List]:
    """
    Purpose:
        For all manifests in a project, upload each as a table and add annotations from the manifest csv.
        Assumes the manifest is already present as a CSV in a dataset in the project.

    """
    # Local imports: DataModelParser and DataModelGraph are not part of the
    # module-level imports above.
    from schematic.schemas.data_model_graph import DataModelGraph
    from schematic.schemas.data_model_parser import DataModelParser

    # Instantiate DataModelParser
    data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
    # Parse Model
    parsed_data_model = data_model_parser.parse_model()

    # Instantiate DataModelGraph
    data_model_grapher = DataModelGraph(parsed_data_model)

    # Generate graph
    graph_data_model = data_model_grapher.generate_data_model_graph()

    # Instantiate DataModelGraphExplorer
    dmge = DataModelGraphExplorer(graph_data_model)

    manifests = []
    manifest_loaded = []
    datasets = self.getStorageDatasetsInProject(projectId)
    for datasetId, datasetName in datasets:
        # encode information about the manifest in a simple list (so that R clients can unpack it)
        # eventually can serialize differently

        manifest = ((datasetId, datasetName), ("", ""), ("", ""))
        manifests.append(manifest)

        manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)

        if manifest_info:
            manifest_id = manifest_info["properties"]["id"]
            manifest_name = manifest_info["properties"]["name"]
            manifest_path = manifest_info["path"]
            manifest = (
                (datasetId, datasetName),
                (manifest_id, manifest_name),
                ("", ""),
            )
            if not dry_run:
                self.associateMetadataWithFiles(
                    dmge, manifest_path, datasetId, manifest_record_type="table"
                )
            manifest_loaded.append(manifest)

    return manifests, manifest_loaded
Purpose:
For all manifests in a project, upload each as a table and add annotations from the manifest csv. Assumes the manifest is already present as a CSV in a dataset in the project.
def move_entities_to_new_project(
    self,
    projectId: str,
    newProjectId: str,
    returnEntities: bool = False,
    dry_run: bool = False,
):
    """
    For each manifest csv in a project, look up all the entity ids that are associated with it.
    Look up each entity in the files, and move the entity to the new project.
    """

    manifests = []
    manifest_loaded = []
    datasets = self.getStorageDatasetsInProject(projectId)
    if datasets:
        for datasetId, datasetName in datasets:
            # encode information about the manifest in a simple list (so that R clients can unpack it)
            # eventually can serialize differently

            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
            manifests.append(manifest)

            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
            if manifest_info:
                manifest_id = manifest_info["properties"]["id"]
                manifest_name = manifest_info["properties"]["name"]
                manifest_path = manifest_info["path"]
                manifest_df = load_df(manifest_path)

                manifest = (
                    (datasetId, datasetName),
                    (manifest_id, manifest_name),
                    ("", ""),
                )
                manifest_loaded.append(manifest)

                annotation_entities = self.storageFileviewTable[
                    (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
                    & (self.storageFileviewTable["type"] == "folder")
                ]["id"]

                if returnEntities:
                    for entityId in annotation_entities:
                        if not dry_run:
                            moved_entity = self.syn.move(entityId, datasetId)
                            self.synapse_entity_tracker.add(
                                synapse_id=moved_entity.id, entity=moved_entity
                            )
                        else:
                            logging.info(
                                f"{entityId} will be moved to folder {datasetId}."
                            )
                else:
                    # generate project folder
                    archive_project_folder = Folder(
                        projectId + "_archive", parent=newProjectId
                    )
                    archive_project_folder = self.syn.store(archive_project_folder)
                    self.synapse_entity_tracker.add(
                        synapse_id=archive_project_folder.id,
                        entity=archive_project_folder,
                    )

                    # generate dataset folder
                    dataset_archive_folder = Folder(
                        "_".join([datasetId, datasetName, "archive"]),
                        parent=archive_project_folder.id,
                    )
                    dataset_archive_folder = self.syn.store(dataset_archive_folder)
                    self.synapse_entity_tracker.add(
                        synapse_id=dataset_archive_folder.id,
                        entity=dataset_archive_folder,
                    )

                    for entityId in annotation_entities:
                        # move entities to folder
                        if not dry_run:
                            moved_entity = self.syn.move(
                                entityId, dataset_archive_folder.id
                            )
                            self.synapse_entity_tracker.add(
                                synapse_id=moved_entity.id, entity=moved_entity
                            )
                        else:
                            logging.info(
                                f"{entityId} will be moved to folder {dataset_archive_folder.id}."
                            )
    else:
        raise LookupError(
            f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
        )
    return manifests, manifest_loaded
For each manifest csv in a project, look up all the entity ids that are associated with it. Look up each entity in the files, and move the entity to the new project.
@tracer.start_as_current_span("SynapseStorage::get_synapse_table")
def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
    """Download a Synapse table as a pandas DataFrame; the returned query results also carry the table schema and etags.

    Args:
        synapse_id: synapse ID of the table to query
    """

    results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
    df = results.asDataFrame(
        rowIdAndVersionInIndex=False,
        na_values=STR_NA_VALUES_FILTERED,
        keep_default_na=False,
    )

    return df, results
Download a Synapse table as a pandas DataFrame; the returned query results also carry the table schema and etags.
Arguments:
- synapse_id: synapse ID of the table to query
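A short hedged example of querying a table: the ID is a placeholder and `store` is an authenticated SynapseStorage instance as above.

    df, results = store.get_synapse_table("syn12345678")
    print(df.shape)         # full table contents as a DataFrame
    print(results.tableId)  # the CsvFileTable query result carries schema/etag info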
def wrapper(*args, **kwargs):
    try:
        return method(*args, **kwargs)
    except SynapseHTTPError as ex:
        str_message = str(ex).replace("\n", "")
        if "trash" in str_message or "does not exist" in str_message:
            logging.warning(str_message)
            return None
        else:
            raise ex
Method to upload a database to an asset store. In Synapse, this will upload a metadata table.

Arguments:
- dmge: DataModelGraphExplorer object
- manifest: pd.DataFrame manifest to upload
- datasetId: synID of the dataset for the manifest
- table_name: name of the table to be uploaded
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
- existingTableId: str of the synId of the existing table, if one already exists
- table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing table (replace) or be added to it (upsert)
- table_column_names: (str) display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting.

Returns:
- manifest_table_id: synID of the uploaded table
- manifest: the original manifest
- table_manifest: manifest formatted appropriately for the table
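A hedged call sketch based on the signature documented above: `dmge` is assumed to be a DataModelGraphExplorer built from your data model, `manifest_df` a validated manifest DataFrame, and the IDs/names placeholders. The three-part return value matches how uploadDB is unpacked elsewhere in this module.

    manifest_table_id, manifest, table_manifest = store.uploadDB(
        dmge=dmge,
        manifest=manifest_df,
        datasetId="syn00000000",
        table_name="example_component_table",  # hypothetical table name
        restrict=False,
        table_manipulation="replace",
        table_column_names="class_label",
    )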
@tracer.start_as_current_span("SynapseStorage::formatDB")
def formatDB(self, dmge, manifest, table_column_names):
    """
    Method to format a manifest appropriately for upload as a table.

    Args:
        dmge: DataModelGraphExplorer object
        manifest: pd.DataFrame manifest to upload
        table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting.
    Returns:
        col_schema: schema for table columns: type, size, etc.
        table_manifest: formatted manifest

    """
    # Rename the manifest columns to display names to match fileview

    blacklist_chars = ["(", ")", ".", " ", "-"]
    manifest_columns = manifest.columns.tolist()

    table_manifest = deepcopy(manifest)

    if table_column_names == "display_name":
        cols = table_manifest.columns

    elif table_column_names == "display_label":
        cols = [
            str(col).translate({ord(x): "" for x in blacklist_chars})
            for col in manifest_columns
        ]

    elif table_column_names == "class_label":
        cols = [
            get_class_label_from_display_name(str(col)).translate(
                {ord(x): "" for x in blacklist_chars}
            )
            for col in manifest_columns
        ]
    else:
        raise ValueError(
            f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
        )

    cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))

    # Reset column names in table manifest
    table_manifest.columns = cols

    # move entity id to end of df
    entity_col = table_manifest.pop("entityId")
    table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)

    # Get the column schema
    col_schema = as_table_columns(table_manifest)

    # Set Id column length to 64 (for some reason not being auto set.)
    for i, col in enumerate(col_schema):
        if col["name"].lower() == "id":
            col_schema[i]["maximumSize"] = 64

    return col_schema, table_manifest
Method to format a manifest appropriately for upload as a table.

Arguments:
- dmge: DataModelGraphExplorer object
- manifest: pd.DataFrame manifest to upload
- table_column_names: (str) display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting.

Returns:
- col_schema: schema for table columns: type, size, etc.
- table_manifest: formatted manifest
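To make the three labeling styles concrete, here is a standalone illustration of the blacklist-character stripping that formatDB applies; it does not call into schematic itself, and the column name is made up.

    blacklist_chars = ["(", ")", ".", " ", "-"]

    def strip_blacklist(name: str) -> str:
        # remove every blacklisted character from a column display name
        return name.translate({ord(c): "" for c in blacklist_chars})

    display_name = "Family History (Cancer)"  # hypothetical column name
    print(display_name)                   # display_name: used as-is
    print(strip_blacklist(display_name))  # display_label: 'FamilyHistoryCancer'
    # class_label additionally upper-camelcases the name via
    # get_class_label_from_display_name before stripping.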
@tracer.start_as_current_span("SynapseStorage::buildDB")
def buildDB(
    self,
    datasetId: str,
    table_name: str,
    col_schema: List,
    table_manifest: pd.DataFrame,
    table_manipulation: str,
    dmge: DataModelGraphExplorer,
    restrict: bool = False,
):
    """
    Method to construct the table appropriately: create new table, replace existing, or upsert new into existing.
    Calls the TableOperations class to execute.

    Args:
        datasetId: synID of the dataset for the manifest
        table_name: name of the table to be uploaded
        col_schema: schema for table columns: type, size, etc. from `formatDB`
        table_manifest: formatted manifest that can be uploaded as a table
        table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing table (replace) or be added to it (upsert)
        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions

    Returns:
        manifest_table_id: synID of the uploaded table

    """
    table_parent_id = self.getDatasetProject(datasetId=datasetId)
    existing_table_id = self.syn.findEntityId(
        name=table_name, parent=table_parent_id
    )

    tableOps = TableOperations(
        synStore=self,
        tableToLoad=table_manifest,
        tableName=table_name,
        datasetId=datasetId,
        existingTableId=existing_table_id,
        restrict=restrict,
        synapse_entity_tracker=self.synapse_entity_tracker,
    )

    if not table_manipulation or existing_table_id is None:
        manifest_table_id = tableOps.createTable(
            columnTypeDict=col_schema,
            specifySchema=True,
        )
    elif existing_table_id is not None:
        if table_manipulation.lower() == "replace":
            manifest_table_id = tableOps.replaceTable(
                specifySchema=True,
                columnTypeDict=col_schema,
            )
        elif table_manipulation.lower() == "upsert":
            manifest_table_id = tableOps.upsertTable(
                dmge=dmge,
            )
        elif table_manipulation.lower() == "update":
            manifest_table_id = tableOps.updateTable()

    if table_manipulation and table_manipulation.lower() == "upsert":
        table_entity = self.synapse_entity_tracker.get(
            synapse_id=existing_table_id or manifest_table_id,
            syn=self.syn,
            download_file=False,
        )
        annos = OldAnnotations(
            id=table_entity.id,
            etag=table_entity.etag,
            values=table_entity.annotations,
        )
        annos["primary_key"] = table_manifest["Component"][0] + "_id"
        annos = self.syn.set_annotations(annos)
        table_entity.etag = annos.etag
        table_entity.annotations = annos

    return manifest_table_id
Method to construct the table appropriately: create new table, replace existing, or upsert new into existing. Calls the TableOperations class to execute.

Arguments:
- datasetId: synID of the dataset for the manifest
- table_name: name of the table to be uploaded
- col_schema: schema for table columns (type, size, etc.) from formatDB
- table_manifest: formatted manifest that can be uploaded as a table
- table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest already exists, should the new metadata replace the existing table (replace) or be added to it (upsert)
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions

Returns:
manifest_table_id: synID of the uploaded table
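A hedged sketch of driving formatDB and buildDB together; all IDs and names are placeholders, and `store`, `dmge`, and `manifest_df` are assumed as in the earlier examples.

    col_schema, table_manifest = store.formatDB(dmge, manifest_df, "class_label")
    manifest_table_id = store.buildDB(
        datasetId="syn00000000",
        table_name="example_component_table",
        col_schema=col_schema,
        table_manifest=table_manifest,
        table_manipulation="upsert",  # 'replace'/'update' also dispatch here;
        dmge=dmge,                    # an empty value creates a new table
        restrict=False,
    )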
@tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
def upload_manifest_file(
    self,
    manifest,
    metadataManifestPath,
    datasetId,
    restrict_manifest,
    component_name="",
):
    # Update manifest to have the new entityId column
    manifest.to_csv(metadataManifestPath, index=False)

    # store manifest to Synapse as a CSV
    # update file name
    file_name_full = metadataManifestPath.split("/")[-1]
    file_extension = file_name_full.split(".")[-1]

    # Differentiate "censored" and "uncensored" manifest
    if "censored" in file_name_full:
        file_name_new = (
            os.path.basename(CONFIG.synapse_manifest_basename)
            + "_"
            + component_name
            + "_censored"
            + "."
            + file_extension
        )
    else:
        file_name_new = (
            os.path.basename(CONFIG.synapse_manifest_basename)
            + "_"
            + component_name
            + "."
            + file_extension
        )

    manifest_synapse_file = None
    try:
        # Rename the file to file_name_new then revert.
        # This is to maintain the original file name in case other code is
        # expecting that the file exists with the original name.
        original_file_path = metadataManifestPath
        new_file_path = os.path.join(
            os.path.dirname(metadataManifestPath), file_name_new
        )
        os.rename(original_file_path, new_file_path)

        manifest_synapse_file = self._store_file_for_manifest_upload(
            new_file_path=new_file_path,
            dataset_id=datasetId,
            existing_file_name=file_name_full,
            file_name_new=file_name_new,
            restrict_manifest=restrict_manifest,
        )
        manifest_synapse_file_id = manifest_synapse_file.id

    finally:
        # Revert the file name back to the original
        os.rename(new_file_path, original_file_path)

        if manifest_synapse_file:
            manifest_synapse_file.path = original_file_path

    return manifest_synapse_file_id
async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
    """Get annotations asynchronously.

    Args:
        synapse_id (str): synapse id of the entity that the annotation belongs to

    Returns:
        Dict[str, Any]: The requested entity bundle matching
        <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
    """
    return await get_entity_id_bundle2(
        entity_id=synapse_id,
        request={"includeAnnotations": True},
        synapse_client=self.syn,
    )
Get annotations asynchronously.
Arguments:
- synapse_id (str): synapse id of the entity that the annotation belongs to
Returns:
Dict[str, Any]: The requested entity bundle matching https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html
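Because this is a coroutine, it must be awaited or driven with asyncio.run; a minimal sketch with a placeholder ID and the `store` instance assumed above:

    import asyncio

    bundle = asyncio.run(store.get_async_annotation("syn12345678"))
    # the bundle nests the annotation values under ['annotations']['annotations'],
    # the same layout consumed by store_async_annotation below
    print(bundle["annotations"]["annotations"])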
async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
    """Store annotations asynchronously.

    Args:
        annotation_dict (dict): annotations in a dictionary format

    Returns:
        Annotations: The stored annotations.
    """
    annotation_data = Annotations.from_dict(
        synapse_annotations=annotation_dict["annotations"]["annotations"]
    )
    annotation_class = Annotations(
        annotations=annotation_data,
        etag=annotation_dict["annotations"]["etag"],
        id=annotation_dict["annotations"]["id"],
    )
    annotation_storage_result = await annotation_class.store_async(
        synapse_client=self.syn
    )
    local_entity = self.synapse_entity_tracker.get(
        synapse_id=annotation_dict["annotations"]["id"],
        syn=self.syn,
        download_file=False,
        retrieve_if_not_present=False,
    )
    if local_entity:
        local_entity.etag = annotation_storage_result.etag
        local_entity.annotations = annotation_storage_result
    return annotation_storage_result
Store annotations asynchronously.
Arguments:
- annotation_dict (dict): annotation in a dictionary format
Returns:
Annotations: The stored annotations.
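A hedged round-trip sketch pairing the two coroutines above; the annotation key is made up, and the {'type': ..., 'value': [...]} shape follows the EntityBundle v2 annotations format the bundle uses.

    import asyncio

    async def tag_entity(store, synapse_id: str) -> None:
        bundle = await store.get_async_annotation(synapse_id)
        # hypothetical annotation key, written in the v2 wire format
        bundle["annotations"]["annotations"]["qc_passed"] = {
            "type": "STRING",
            "value": ["true"],
        }
        stored = await store.store_async_annotation(bundle)
        print(stored.etag)  # etag changes once the annotations are stored

    asyncio.run(tag_entity(store, "syn12345678"))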
def process_row_annotations(
    self,
    dmge: DataModelGraphExplorer,
    metadata_syn: Dict[str, Any],
    hide_blanks: bool,
    csv_list_regex: str,
    annos: Dict[str, Any],
    annotation_keys: str,
) -> Dict[str, Any]:
    """Processes metadata annotations based on the logic below:
    1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
        An empty or whitespace-only string.
        A NaN value (if the annotation is a float).
    If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
    If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.

    2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name".
    Check if the rule contains "list" as a rule; if it does, split the string by commas and assign the resulting list as the annotation value for that key.

    3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

    4. Returns the updated annotations dictionary.

    Args:
        dmge (DataModelGraphExplorer): data model graph explorer
        metadata_syn (dict): metadata used for Synapse storage
        hide_blanks (bool): if true, does not upload annotation keys with blank values.
        csv_list_regex (str): regex to match a comma separated list
        annos (Dict[str, Any]): dictionary of annotations returned from synapse
        annotation_keys (str): display_label/class_label

    Returns:
        Dict[str, Any]: annotations as a dictionary

    ```mermaid
    flowchart TD
        A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
        C -- Yes --> D{Is hide_blanks True?}
        D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
        D -- No --> F[Assign empty string to annotation key]
        C -- No --> G{Is anno_v a string?}
        G -- No --> H[Assign original value of anno_v to annotation key]
        G -- Yes --> I{Does anno_v match csv_list_regex?}
        I -- Yes --> J[Get validation rule of anno_k]
        J --> K{Does the validation rule contain 'list'}
        K -- Yes --> L[Split anno_v by commas and assign as list]
        I -- No --> H
        K -- No --> H
    ```
    """
    for anno_k, anno_v in metadata_syn.items():
        # Remove keys with nan or empty string values, or values that are only
        # whitespace, from the dict of annotations to be uploaded
        # if present on current data annotation
        if hide_blanks and (
            (isinstance(anno_v, str) and anno_v.strip() == "")
            or (isinstance(anno_v, float) and np.isnan(anno_v))
        ):
            if anno_k in annos["annotations"]["annotations"]:
                annos["annotations"]["annotations"].pop(anno_k)
            continue

        # Otherwise save annotation as appropriate
        if isinstance(anno_v, float) and np.isnan(anno_v):
            annos["annotations"]["annotations"][anno_k] = ""
            continue

        # Handle strings that match the csv_list_regex and pass the validation rule
        if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
            # Use a dictionary to dynamically choose the argument
            param = (
                {"node_display_name": anno_k}
                if annotation_keys == "display_label"
                else {"node_label": anno_k}
            )
            node_validation_rules = dmge.get_node_validation_rules(**param)

            if rule_in_rule_list("list", node_validation_rules):
                annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
                continue
        # default: assign the original value
        annos["annotations"]["annotations"][anno_k] = anno_v

    return annos
Processes metadata annotations based on the logic below:

1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is an empty or whitespace-only string, or a NaN value (if the annotation is a float). If any of these conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped. If any of these conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name". If the rule contains "list", split the string by commas and assign the resulting list as the annotation value for that key.
3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
4. Returns the updated annotations dictionary.

Arguments:
- dmge (DataModelGraphExplorer): data model graph explorer
- metadata_syn (dict): metadata used for Synapse storage
- hide_blanks (bool): if true, does not upload annotation keys with blank values.
- csv_list_regex (str): regex to match a comma separated list
- annos (Dict[str, Any]): dictionary of annotations returned from Synapse
- annotation_keys (str): display_label/class_label

Returns:
Dict[str, Any]: annotations as a dictionary

flowchart TD
    A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
    C -- Yes --> D{Is hide_blanks True?}
    D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
    D -- No --> F[Assign empty string to annotation key]
    C -- No --> G{Is anno_v a string?}
    G -- No --> H[Assign original value of anno_v to annotation key]
    G -- Yes --> I{Does anno_v match csv_list_regex?}
    I -- Yes --> J[Get validation rule of anno_k]
    J --> K{Does the validation rule contain 'list'}
    K -- Yes --> L[Split anno_v by commas and assign as list]
    I -- No --> H
    K -- No --> H
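The decision logic above can be illustrated in isolation; this stand-in uses a simplified comma-list regex rather than schematic's comma_separated_list_regex, and folds the validation-rule lookup into a boolean flag.

    import re
    import numpy as np

    CSV_LIST_REGEX = r"([^,]+,)+[^,]*"  # simplified stand-in pattern

    def resolve_value(value, hide_blanks: bool, is_list_field: bool):
        blank = (isinstance(value, str) and not value.strip()) or (
            isinstance(value, float) and np.isnan(value)
        )
        if blank:
            # None means "drop this key"; mirrors steps 1a/1b above
            return None if hide_blanks else ""
        if isinstance(value, str) and re.fullmatch(CSV_LIST_REGEX, value) and is_list_field:
            return value.split(",")  # step 2: list-rule fields become lists
        return value                 # step 3: everything else passes through

    print(resolve_value("a,b,c", hide_blanks=True, is_list_field=True))   # ['a', 'b', 'c']
    print(resolve_value("   ", hide_blanks=False, is_list_field=False))   # ''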
async def wrapper(*args: Any, **kwargs: Any) -> Any:
    try:
        return await method(*args, **kwargs)
    except SynapseHTTPError as ex:
        str_message = str(ex).replace("\n", "")
        if "trash" in str_message or "does not exist" in str_message:
            logging.warning(str_message)
            return None
        else:
            raise ex
def wrapper(*args, **kwargs):
    try:
        return method(*args, **kwargs)
    except SynapseHTTPError as ex:
        str_message = str(ex).replace("\n", "")
        if "trash" in str_message or "does not exist" in str_message:
            logging.warning(str_message)
            return None
        else:
            raise ex
Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. For now this only sets the Component annotation.
@tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
async def add_annotations_to_entities_files(
    self,
    dmge,
    manifest,
    manifest_record_type: str,
    datasetId: str,
    hideBlanks: bool,
    manifest_synapse_table_id="",
    annotation_keys: str = "class_label",
):
    """
    Depending on upload type, add ids to the entityId row. Add annotations to connected
    files and folders. Despite the name of this function, it also applies to folders.

    Args:
        dmge: DataModelGraphExplorer Object
        manifest (pd.DataFrame): loaded df containing user supplied data.
        manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
        datasetId (str): synapse ID of folder containing the dataset
        hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
        manifest_synapse_table_id (str): Default is an empty string ''.
        annotation_keys (str): display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting while ensuring the label is formatted properly for Synapse annotations.
    Returns:
        manifest (pd.DataFrame): modified to add entityId as appropriate

    """

    # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
    if "filename" in [col.lower() for col in manifest.columns]:
        # get current list of files and store as dataframe
        dataset_files = self.getFilesInStorageDataset(datasetId)
        files_and_entityIds = self._get_file_entityIds(
            dataset_files=dataset_files, only_new_files=False
        )
        file_df = pd.DataFrame(files_and_entityIds)

        # Merge dataframes to add entityIds
        manifest = manifest.merge(
            file_df, how="left", on="Filename", suffixes=["_x", None]
        ).drop("entityId_x", axis=1)

    # Fill `entityId` for each row if missing and annotate entity as appropriate
    requests = set()
    for idx, row in manifest.iterrows():
        if not row["entityId"] and (
            manifest_record_type == "file_and_entities"
            or manifest_record_type == "table_file_and_entities"
        ):
            manifest, entityId = self._create_entity_id(
                idx, row, manifest, datasetId
            )
        elif not row["entityId"] and manifest_record_type == "table_and_file":
            # If not creating entityIds, fill the column with the manifest table
            # id so each row still carries a valid entityId value
            row["entityId"] = manifest_synapse_table_id
            manifest.loc[idx, "entityId"] = manifest_synapse_table_id
            entityId = ""
        # If the row is the manifest table, do not add annotations
        elif row["entityId"] == manifest_synapse_table_id:
            entityId = ""
        else:
            # get the file id of the file to annotate, collected in above step.
            entityId = row["entityId"]

        # Adding annotations to connected files.
        if entityId:
            # Format annotations for Synapse
            annos_task = asyncio.create_task(
                self.format_row_annotations(
                    dmge, row, entityId, hideBlanks, annotation_keys
                )
            )
            requests.add(annos_task)
    await self._process_store_annos(requests)
    return manifest
Depending on upload type, add ids to the entityId row. Add annotations to connected files and folders. Despite the name of this function, it also applies to folders.
Arguments:
- dmge: DataModelGraphExplorer Object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- datasetId (str): synapse ID of folder containing the dataset
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- manifest_synapse_table_id (str): Default is an empty string ''.
- annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:
manifest (pd.DataFrame): modified to add entityId as appropriate
@tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
def upload_manifest_as_table(
    self,
    dmge: DataModelGraphExplorer,
    manifest: pd.DataFrame,
    metadataManifestPath: str,
    datasetId: str,
    table_name: str,
    component_name: str,
    restrict: bool,
    manifest_record_type: str,
    hideBlanks: bool,
    table_manipulation: str,
    table_column_names: str,
    annotation_keys: str,
    file_annotations_upload: bool = True,
):
    """Upload manifest to Synapse as a table and csv.
    Args:
        dmge: DataModelGraphExplorer object
        manifest (pd.DataFrame): loaded df containing user supplied data.
        metadataManifestPath: path to csv containing a validated metadata manifest.
        datasetId (str): synapse ID of folder containing the dataset
        table_name (str): Generated to name the table being uploaded.
        component_name (str): Name of the component manifest that is currently being uploaded.
        restrict (bool): Flag for censored data.
        manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
        hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
        table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
        table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting.
        annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting while ensuring the label is formatted properly for Synapse annotations.
        file_annotations_upload (bool): Default is True. If false, do not add annotations to files.
    Return:
        manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
    """
    # Upload manifest as a table, get the ID and updated manifest.
    manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
        dmge=dmge,
        manifest=manifest,
        datasetId=datasetId,
        table_name=table_name,
        restrict=restrict,
        table_manipulation=table_manipulation,
        table_column_names=table_column_names,
    )

    if file_annotations_upload:
        manifest = asyncio.run(
            self.add_annotations_to_entities_files(
                dmge,
                manifest,
                manifest_record_type,
                datasetId,
                hideBlanks,
                manifest_synapse_table_id,
                annotation_keys,
            )
        )
    # Load manifest to synapse as a CSV File
    manifest_synapse_file_id = self.upload_manifest_file(
        manifest=manifest,
        metadataManifestPath=metadataManifestPath,
        datasetId=datasetId,
        restrict_manifest=restrict,
        component_name=component_name,
    )

    # Set annotations for the file manifest.
    manifest_annotations = self.format_manifest_annotations(
        manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
    )
    annos = self.syn.set_annotations(annotations=manifest_annotations)
    manifest_entity = self.synapse_entity_tracker.get(
        synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
    )
    manifest_entity.annotations = annos
    manifest_entity.etag = annos.etag

    logger.info("Associated manifest file with dataset on Synapse.")

    # Update manifest Synapse table with new entity id column.
    manifest_synapse_table_id, manifest, _ = self.uploadDB(
        dmge=dmge,
        manifest=manifest,
        datasetId=datasetId,
        table_name=table_name,
        restrict=restrict,
        table_manipulation="update",
        table_column_names=table_column_names,
    )

    # Set annotations for the table manifest
    manifest_annotations = self.format_manifest_annotations(
        manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
    )
    annotations_manifest_table = self.syn.set_annotations(
        annotations=manifest_annotations
    )
    manifest_table_entity = self.synapse_entity_tracker.get(
        synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
    )
    manifest_table_entity.annotations = annotations_manifest_table
    manifest_table_entity.etag = annotations_manifest_table.etag

    return manifest_synapse_file_id
Upload manifest to Synapse as a table and csv.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- table_name (str): Generated to name the table being uploaded.
- component_name (str): Name of the component manifest that is currently being uploaded.
- restrict (bool): Flag for censored data.
- manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
- annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
@tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
def upload_manifest_as_csv(
    self,
    dmge,
    manifest,
    metadataManifestPath,
    datasetId,
    restrict,
    manifest_record_type,
    hideBlanks,
    component_name,
    annotation_keys: str,
    file_annotations_upload: bool = True,
):
    """Upload manifest to Synapse as a csv only.
    Args:
        dmge: DataModelGraphExplorer object
        manifest (pd.DataFrame): loaded df containing user supplied data.
        metadataManifestPath: path to csv containing a validated metadata manifest.
        datasetId (str): synapse ID of folder containing the dataset
        restrict (bool): Flag for censored data.
        manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
        hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
        annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting while ensuring the label is formatted properly for Synapse annotations.
        file_annotations_upload (bool): Default is True. If false, do not add annotations to files.
    Return:
        manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
    """
    if file_annotations_upload:
        manifest = asyncio.run(
            self.add_annotations_to_entities_files(
                dmge,
                manifest,
                manifest_record_type,
                datasetId,
                hideBlanks,
                annotation_keys=annotation_keys,
            )
        )

    # Load manifest to synapse as a CSV File
    manifest_synapse_file_id = self.upload_manifest_file(
        manifest,
        metadataManifestPath,
        datasetId,
        restrict,
        component_name=component_name,
    )

    # Set annotations for the file manifest.
    manifest_annotations = self.format_manifest_annotations(
        manifest, manifest_synapse_file_id
    )
    annos = self.syn.set_annotations(manifest_annotations)
    manifest_entity = self.synapse_entity_tracker.get(
        synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
    )
    manifest_entity.annotations = annos
    manifest_entity.etag = annos.etag

    logger.info("Associated manifest file with dataset on Synapse.")

    return manifest_synapse_file_id
Upload manifest to Synapse as a csv only.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- restrict (bool): Flag for censored data.
- manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
@tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
def upload_manifest_combo(
    self,
    dmge,
    manifest,
    metadataManifestPath,
    datasetId,
    table_name,
    component_name,
    restrict,
    manifest_record_type,
    hideBlanks,
    table_manipulation,
    table_column_names: str,
    annotation_keys: str,
    file_annotations_upload: bool = True,
):
    """Upload manifest to Synapse as a table and CSV with entities.
    Args:
        dmge: DataModelGraphExplorer object
        manifest (pd.DataFrame): loaded df containing user supplied data.
        metadataManifestPath: path to csv containing a validated metadata manifest.
        datasetId (str): synapse ID of folder containing the dataset
        table_name (str): Generated to name the table being uploaded.
        component_name (str): Name of the component manifest that is currently being uploaded.
        restrict (bool): Flag for censored data.
        manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
        hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
        table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
        table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting.
        annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting while ensuring the label is formatted properly for Synapse annotations.
        file_annotations_upload (bool): Default is True. If false, do not add annotations to files.
    Return:
        manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
    """
    manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
        dmge=dmge,
        manifest=manifest,
        datasetId=datasetId,
        table_name=table_name,
        restrict=restrict,
        table_manipulation=table_manipulation,
        table_column_names=table_column_names,
    )

    if file_annotations_upload:
        manifest = asyncio.run(
            self.add_annotations_to_entities_files(
                dmge,
                manifest,
                manifest_record_type,
                datasetId,
                hideBlanks,
                manifest_synapse_table_id,
                annotation_keys=annotation_keys,
            )
        )

    # Load manifest to synapse as a CSV File
    manifest_synapse_file_id = self.upload_manifest_file(
        manifest, metadataManifestPath, datasetId, restrict, component_name
    )

    # Set annotations for the file manifest.
    manifest_annotations = self.format_manifest_annotations(
        manifest, manifest_synapse_file_id
    )
    file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
    manifest_entity = self.synapse_entity_tracker.get(
        synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
    )
    manifest_entity.annotations = file_manifest_annotations
    manifest_entity.etag = file_manifest_annotations.etag
    logger.info("Associated manifest file with dataset on Synapse.")

    # Update manifest Synapse table with new entity id column.
    manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
        dmge=dmge,
        manifest=manifest,
        datasetId=datasetId,
        table_name=table_name,
        restrict=restrict,
        table_manipulation="update",
        table_column_names=table_column_names,
    )

    # Set annotations for the table manifest
    manifest_annotations = self.format_manifest_annotations(
        manifest, manifest_synapse_table_id
    )
    table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
    manifest_entity = self.synapse_entity_tracker.get(
        synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
    )
    manifest_entity.annotations = table_manifest_annotations
    manifest_entity.etag = table_manifest_annotations.etag
    return manifest_synapse_file_id
Upload manifest to Synapse as a table and CSV with entities.
Arguments:
- dmge: DataModelGraphExplorer object
- manifest (pd.DataFrame): loaded df containing user supplied data.
- metadataManifestPath: path to csv containing a validated metadata manifest.
- datasetId (str): synapse ID of folder containing the dataset
- table_name (str): Generated to name the table being uploaded.
- component_name (str): Name of the component manifest that is currently being uploaded.
- restrict (bool): Flag for censored data.
- manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
- hideBlanks (bool): Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
- table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
- annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
- file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:
manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
@tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
def associateMetadataWithFiles(
    self,
    dmge: DataModelGraphExplorer,
    metadataManifestPath: str,
    datasetId: str,
    manifest_record_type: str = "table_file_and_entities",
    hideBlanks: bool = False,
    restrict_manifest=False,
    table_manipulation: str = "replace",
    table_column_names: str = "class_label",
    annotation_keys: str = "class_label",
    file_annotations_upload: bool = True,
) -> str:
    """Associate metadata with files in a storage dataset already on Synapse.
    Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.

    If this is a new manifest, there could be no Synapse entities associated with the rows of this manifest;
    this may be due to the data type (e.g. clinical data) being tabular
    and not requiring files; to utilize uniform interfaces downstream
    (i.e. fileviews), a Synapse entity (a folder) is created for each row
    and an entity column is added to the manifest containing the resulting
    entity IDs; a table is also created at present as an additional interface
    for downstream query and interaction with the data.

    Args:
        dmge: DataModelGraphExplorer Object
        metadataManifestPath: path to csv containing a validated metadata manifest.
            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
            Some datasets, e.g. clinical data, do not contain file ids, but data is stored in a table: one row per item.
            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
        datasetId: synapse ID of folder containing the dataset
        manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_and_entities and table in combination.
        hideBlanks: Default is False. Boolean flag that does not upload annotation keys with blank values when true. Uploads annotation keys with empty string values when false.
        restrict_manifest (bool): Default is False. Flag for censored data.
        table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
        table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting.
        annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
            name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
            display label formatting while ensuring the label is formatted properly for Synapse annotations.
        file_annotations_upload (bool): Default is True. If false, do not add annotations to files.
    Returns:
        manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
    """
    # Read new manifest CSV:
    manifest = self._read_manifest(metadataManifestPath)
    manifest = self._add_id_columns_to_manifest(manifest, dmge)

    table_name, component_name = self._generate_table_name(manifest)

    # Upload manifest to synapse based on user input (manifest_record_type)
    if manifest_record_type == "file_only":
        manifest_synapse_file_id = self.upload_manifest_as_csv(
            dmge=dmge,
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            restrict=restrict_manifest,
            hideBlanks=hideBlanks,
            manifest_record_type=manifest_record_type,
            component_name=component_name,
            annotation_keys=annotation_keys,
            file_annotations_upload=file_annotations_upload,
        )
    elif manifest_record_type == "table_and_file":
        manifest_synapse_file_id = self.upload_manifest_as_table(
            dmge=dmge,
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            table_name=table_name,
            component_name=component_name,
            restrict=restrict_manifest,
            hideBlanks=hideBlanks,
            manifest_record_type=manifest_record_type,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
            annotation_keys=annotation_keys,
            file_annotations_upload=file_annotations_upload,
        )
    elif manifest_record_type == "file_and_entities":
        manifest_synapse_file_id = self.upload_manifest_as_csv(
            dmge=dmge,
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            restrict=restrict_manifest,
            hideBlanks=hideBlanks,
            manifest_record_type=manifest_record_type,
            component_name=component_name,
            annotation_keys=annotation_keys,
            file_annotations_upload=file_annotations_upload,
        )
    elif manifest_record_type == "table_file_and_entities":
        manifest_synapse_file_id = self.upload_manifest_combo(
            dmge=dmge,
            manifest=manifest,
            metadataManifestPath=metadataManifestPath,
            datasetId=datasetId,
            table_name=table_name,
            component_name=component_name,
            restrict=restrict_manifest,
            hideBlanks=hideBlanks,
            manifest_record_type=manifest_record_type,
            table_manipulation=table_manipulation,
            table_column_names=table_column_names,
            annotation_keys=annotation_keys,
            file_annotations_upload=file_annotations_upload,
        )
    else:
        raise ValueError("Please enter a valid manifest_record_type.")
    return manifest_synapse_file_id
Associate metadata with files in a storage dataset already on Synapse. Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
If this is a new manifest there could be no Synapse entities associated with the rows of this manifest this may be due to data type (e.g. clinical data) being tabular and not requiring files; to utilize uniform interfaces downstream (i.e. fileviews), a Synapse entity (a folder) is created for each row and an entity column is added to the manifest containing the resulting entity IDs; a table is also created at present as an additional interface for downstream query and interaction with the data.
Arguments:
- dmge: DataModelGraphExplorer Object
- metadataManifestPath: path to csv containing a validated metadata manifest. The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. Some datasets, e.g. clinical data, do not contain file ids, but data is stored in a table: one row per item. In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
- datasetId: synapse ID of folder containing the dataset
- manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_and_entities and table in combination.
- hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
- restrict_manifest (bool): Default is false. Flag for censored data.
- table_malnipulation (str): Default is 'replace'. Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
- table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
- annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:
manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
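For orientation, here is a minimal usage sketch. It assumes an already-initialized SynapseStorage instance named `store` and a DataModelGraphExplorer named `dmge`; the dataset ID and manifest path are placeholders.

    manifest_synapse_file_id = store.associateMetadataWithFiles(
        dmge=dmge,  # DataModelGraphExplorer built from the data model
        metadataManifestPath="output/example_manifest.csv",  # placeholder path
        datasetId="syn12345678",  # placeholder dataset Synapse ID
        manifest_record_type="table_file_and_entities",  # the default behavior
    )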
2707 def getTableAnnotations(self, table_id: str): 2708 """Generate dictionary of annotations for the given Synapse table. 2709 Synapse returns all custom annotations as lists since they 2710 can contain multiple values. In all cases, the values will 2711 be converted into strings and concatenated with ", ". 2712 2713 Args: 2714 table_id (str): Synapse ID for the table entity. 2715 2716 Returns: 2717 dict: Annotations as comma-separated strings. 2718 """ 2719 try: 2720 entity = self.synapse_entity_tracker.get( 2721 synapse_id=table_id, syn=self.syn, download_file=False 2722 ) 2723 is_table = entity.concreteType.endswith(".TableEntity") 2724 annotations_raw = entity.annotations 2725 except SynapseHTTPError: 2726 # If an error occurs with retrieving entity, skip it 2727 # This could be caused by a temporary file view that 2728 # was deleted since its ID was retrieved 2729 is_table = False 2730 2731 # Skip anything that isn't a table 2732 if not is_table: 2733 return None 2734 2735 annotations = self.getEntityAnnotations(table_id, entity, annotations_raw) 2736 2737 return annotations
Generate dictionary of annotations for the given Synapse table. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".
Arguments:
- table_id (str): Synapse ID for the table entity.
Returns:
dict: Annotations as comma-separated strings.
2739 def getFileAnnotations(self, fileId: str) -> Dict[str, str]: 2740 """Generate dictionary of annotations for the given Synapse file. 2741 Synapse returns all custom annotations as lists since they 2742 can contain multiple values. In all cases, the values will 2743 be converted into strings and concatenated with ", ". 2744 2745 Args: 2746 fileId (str): Synapse ID for dataset file. 2747 2748 Returns: 2749 dict: Annotations as comma-separated strings. 2750 """ 2751 2752 # Get entity metadata, including annotations 2753 try: 2754 entity = self.synapse_entity_tracker.get( 2755 synapse_id=fileId, syn=self.syn, download_file=False 2756 ) 2757 is_file = entity.concreteType.endswith(".FileEntity") 2758 is_folder = entity.concreteType.endswith(".Folder") 2759 annotations_raw = entity.annotations 2760 except SynapseHTTPError: 2761 # If an error occurs with retrieving entity, skip it 2762 # This could be caused by a temporary file view that 2763 # was deleted since its ID was retrieved 2764 is_file, is_folder = False, False 2765 2766 # Skip anything that isn't a file or folder 2767 if not (is_file or is_folder): 2768 return None 2769 2770 annotations = self.getEntityAnnotations(fileId, entity, annotations_raw) 2771 2772 return annotations
Generate dictionary of annotations for the given Synapse file. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".
Arguments:
- fileId (str): Synapse ID for dataset file.
Returns:
dict: Annotations as comma-separated strings.
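A short sketch of single-file annotation retrieval, assuming `store` is an initialized SynapseStorage and the Synapse ID is a placeholder; `None` is returned for entities that are not files or folders.

    annotations = store.getFileAnnotations("syn12345678")
    if annotations is not None:
        # entityId and eTag are always appended to the custom annotations
        print(annotations["entityId"], annotations["eTag"])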
2774 def getEntityAnnotations(self, fileId, entity, annotations_raw): 2775 # Extract annotations from their lists and stringify. For example: 2776 # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']} 2777 annotations = dict() 2778 for key, vals in annotations_raw.items(): 2779 if isinstance(vals, list) and len(vals) == 1: 2780 annotations[key] = str(vals[0]) 2781 else: 2782 annotations[key] = ", ".join(str(v) for v in vals) 2783 2784 # Add the file entity ID and eTag, which weren't lists 2785 assert fileId == entity.id, ( 2786 "For some reason, the Synapse ID in the response doesn't match " 2787 "the Synapse ID sent in the request (via synapseclient)." 2788 ) 2789 annotations["entityId"] = fileId 2790 annotations["eTag"] = entity.etag 2791 2792 return annotations
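The stringification rule above is plain Python, so its effect can be shown without Synapse: single-element lists collapse to the bare value, and multi-element lists are joined with ", ".

    annotations_raw = {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}
    annotations = {}
    for key, vals in annotations_raw.items():
        if isinstance(vals, list) and len(vals) == 1:
            annotations[key] = str(vals[0])
        else:
            annotations[key] = ", ".join(str(v) for v in vals)
    print(annotations)  # {'YearofBirth': '1980', 'author': 'bruno, milen, sujay'}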
2794 def getDatasetAnnotations( 2795 self, datasetId: str, fill_na: bool = True, force_batch: bool = False 2796 ) -> pd.DataFrame: 2797 """Generate table for annotations across all files in given dataset. 2798 2799 Args: 2800 datasetId (str): Synapse ID for dataset folder. 2801 fill_na (bool): Whether to replace missing values with 2802 blank strings. 2803 force_batch (bool): Whether to force the function to use 2804 the batch mode, which uses a file view to retrieve 2805 annotations for a given dataset. Defaults to False 2806 unless there are 50 or more files in the dataset. 2807 2808 Returns: 2809 pd.DataFrame: Table of annotations. 2810 """ 2811 # Get all files in given dataset 2812 dataset_files = self.getFilesInStorageDataset(datasetId) 2813 2814 # if there are no dataset files, there are no annotations; 2815 # return an empty data frame 2816 if not dataset_files: 2817 return pd.DataFrame() 2818 2819 dataset_files_map = dict(dataset_files) 2820 dataset_file_ids, _ = list(zip(*dataset_files)) 2821 2822 # Get annotations for each file from Step 1 2823 # Batch mode 2824 try_batch = len(dataset_files) >= 50 or force_batch 2825 if try_batch: 2826 try: 2827 logger.info("Trying batch mode for retrieving Synapse annotations") 2828 table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids) 2829 except (SynapseAuthenticationError, SynapseHTTPError, ValueError): 2830 logger.info( 2831 f"Unable to create a temporary file view bound to {datasetId}. " 2832 "Defaulting to slower iterative retrieval of annotations." 2833 ) 2834 # Default to the slower non-batch method 2835 logger.info("Batch mode failed (probably due to permission error)") 2836 try_batch = False 2837 2838 # Non-batch mode 2839 if not try_batch: 2840 logger.info("Using slower (non-batch) sequential mode") 2841 records = [self.getFileAnnotations(i) for i in dataset_file_ids] 2842 # Remove any annotations for non-file/folders (stored as None) 2843 records = filter(None, records) 2844 table = pd.DataFrame.from_records(records) 2845 2846 # Add filenames for the files that "survived" annotation retrieval 2847 filenames = [dataset_files_map[i] for i in table["entityId"]] 2848 2849 if "Filename" not in table.columns: 2850 table.insert(0, "Filename", filenames) 2851 2852 # Ensure that entityId and eTag are at the end 2853 entity_ids = table.pop("entityId") 2854 etags = table.pop("eTag") 2855 table.insert(len(table.columns), "entityId", entity_ids) 2856 table.insert(len(table.columns), "eTag", etags) 2857 2858 # Missing values are filled in with empty strings for Google Sheets 2859 if fill_na: 2860 table.fillna("", inplace=True) 2861 2862 # Force all values as strings 2863 return table.astype(str)
Generate table for annotations across all files in given dataset.
Arguments:
- datasetId (str): Synapse ID for dataset folder.
- fill_na (bool): Whether to replace missing values with blank strings.
- force_batch (bool): Whether to force the function to use batch mode, which uses a file view to retrieve annotations for a given dataset. Defaults to False unless there are 50 or more files in the dataset.
Returns:
pd.DataFrame: Table of annotations.
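A usage sketch, assuming `store` is an initialized SynapseStorage and the dataset ID is a placeholder. Batch mode is chosen automatically for 50 or more files, but can be forced:

    table = store.getDatasetAnnotations("syn12345678")  # mode chosen automatically
    table = store.getDatasetAnnotations("syn12345678", force_batch=True)  # force file-view mode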
2877 @tracer.start_as_current_span("SynapseStorage::getDatasetProject") 2878 @retry( 2879 stop=stop_after_attempt(5), 2880 wait=wait_chain( 2881 *[wait_fixed(10) for i in range(2)] 2882 + [wait_fixed(15) for i in range(2)] 2883 + [wait_fixed(20)] 2884 ), 2885 retry=retry_if_exception_type(LookupError), 2886 retry_error_callback=raise_final_error, 2887 ) 2888 def getDatasetProject(self, datasetId: str) -> str: 2889 """Get parent project for a given dataset ID. 2890 2891 Args: 2892 datasetId (str): Synapse entity ID (folder or project). 2893 2894 Raises: 2895 ValueError: Raised if Synapse ID cannot be retrieved 2896 by the user or if it doesn't appear in the file view. 2897 2898 Returns: 2899 str: The Synapse ID for the parent project. 2900 """ 2901 2902 # Subset main file view 2903 dataset_index = self.storageFileviewTable["id"] == datasetId 2904 dataset_row = self.storageFileviewTable[dataset_index] 2905 2906 # re-query if no datasets found 2907 if dataset_row.empty: 2908 sleep(5) 2909 self.query_fileview(force_requery=True) 2910 # Subset main file view 2911 dataset_index = self.storageFileviewTable["id"] == datasetId 2912 dataset_row = self.storageFileviewTable[dataset_index] 2913 2914 # Return `projectId` for given row if only one found 2915 if len(dataset_row) == 1: 2916 dataset_project = dataset_row["projectId"].values[0] 2917 return dataset_project 2918 2919 # Otherwise, check if already project itself 2920 try: 2921 syn_object = self.synapse_entity_tracker.get( 2922 synapse_id=datasetId, syn=self.syn, download_file=False 2923 ) 2924 if syn_object.properties["concreteType"].endswith("Project"): 2925 return datasetId 2926 except SynapseHTTPError: 2927 raise PermissionError( 2928 f"The given dataset ({datasetId}) isn't accessible with this " 2929 "user. This might be caused by a typo in the dataset Synapse ID." 2930 ) 2931 2932 # If not, then assume dataset not in file view 2933 raise LookupError( 2934 f"The given dataset ({datasetId}) doesn't appear in the " 2935 f"configured file view ({self.storageFileview}). This might " 2936 "mean that the file view's scope needs to be updated." 2937 )
Get parent project for a given dataset ID.
Arguments:
- datasetId (str): Synapse entity ID (folder or project).
Raises:
- ValueError: Raised if the Synapse ID cannot be retrieved by the user or if it doesn't appear in the file view.
Returns:
str: The Synapse ID for the parent project.
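Because of the tenacity decorator above, a dataset missing from the file view is retried (up to five attempts, with 10-20 second waits between tries) before the lookup failure finally propagates. A minimal caller sketch, with a placeholder ID:

    try:
        project_id = store.getDatasetProject("syn12345678")
    except LookupError:
        pass  # dataset absent from the configured file view, even after retries
        # (assuming raise_final_error re-raises the final LookupError)
    except PermissionError:
        pass  # dataset ID not accessible to the current user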
2939 def getDatasetAnnotationsBatch( 2940 self, datasetId: str, dataset_file_ids: Sequence[str] = None 2941 ) -> pd.DataFrame: 2942 """Generate table for annotations across all files in given dataset. 2943 This function uses a temporary file view to generate a table 2944 instead of iteratively querying for individual entity annotations. 2945 This function is expected to run much faster than 2946 `self.getDatasetAnnotations` on large datasets. 2947 2948 Args: 2949 datasetId (str): Synapse ID for dataset folder. 2950 dataset_file_ids (Sequence[str]): List of Synapse IDs 2951 for dataset files/folders used to subset the table. 2952 2953 Returns: 2954 pd.DataFrame: Table of annotations. 2955 """ 2956 # Create data frame from annotations file view 2957 with DatasetFileView(datasetId, self.syn) as fileview: 2958 table = fileview.query() 2959 2960 if dataset_file_ids: 2961 table = table.loc[table.index.intersection(dataset_file_ids)] 2962 2963 table = table.reset_index(drop=True) 2964 2965 return table
Generate table for annotations across all files in given dataset.
This function uses a temporary file view to generate a table
instead of iteratively querying for individual entity annotations.
This function is expected to run much faster than self.getDatasetAnnotations on large datasets.
Arguments:
- datasetId (str): Synapse ID for dataset folder.
- dataset_file_ids (Sequence[str]): List of Synapse IDs for dataset files/folders used to subset the table.
Returns:
pd.DataFrame: Table of annotations.
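A sketch of direct batch retrieval, assuming `store` and placeholder IDs; passing dataset_file_ids restricts the resulting table to those entities:

    file_ids = ["syn11111111", "syn22222222"]  # placeholder file Synapse IDs
    table = store.getDatasetAnnotationsBatch("syn12345678", dataset_file_ids=file_ids)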
2978class TableOperations: 2979 """ 2980 Object to hold functions for various table operations specific to the Synapse Asset Store. 2981 2982 Currently implement operations are: 2983 createTable: upload a manifest as a new table when none exist 2984 replaceTable: replace a metadata in a table from one manifest with metadata from another manifest 2985 updateTable: add a column to a table that already exists on synapse 2986 2987 Operations currently in development are: 2988 upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest 2989 """ 2990 2991 def __init__( 2992 self, 2993 synStore: SynapseStorage, 2994 tableToLoad: pd.DataFrame = None, 2995 tableName: str = None, 2996 datasetId: str = None, 2997 existingTableId: str = None, 2998 restrict: bool = False, 2999 synapse_entity_tracker: SynapseEntityTracker = None, 3000 ): 3001 """ 3002 Class governing table operations (creation, replacement, upserts, updates) in schematic 3003 3004 tableToLoad: manifest formatted appropriately for the table 3005 tableName: name of the table to be uploaded 3006 datasetId: synID of the dataset for the manifest 3007 existingTableId: synId of the table currently exising on synapse (if there is one) 3008 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 3009 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 3010 3011 """ 3012 self.synStore = synStore 3013 self.tableToLoad = tableToLoad 3014 self.tableName = tableName 3015 self.datasetId = datasetId 3016 self.existingTableId = existingTableId 3017 self.restrict = restrict 3018 self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker() 3019 3020 @tracer.start_as_current_span("TableOperations::createTable") 3021 def createTable( 3022 self, 3023 columnTypeDict: dict = None, 3024 specifySchema: bool = True, 3025 ): 3026 """ 3027 Method to create a table from a metadata manifest and upload it to synapse 3028 3029 Args: 3030 columnTypeDict: dictionary schema for table columns: type, size, etc 3031 specifySchema: to specify a specific schema for the table format 3032 3033 Returns: 3034 table.schema.id: synID of the newly created table 3035 """ 3036 datasetEntity = self.synapse_entity_tracker.get( 3037 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3038 ) 3039 datasetName = datasetEntity.name 3040 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3041 3042 if not self.tableName: 3043 self.tableName = datasetName + "table" 3044 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3045 if specifySchema: 3046 if columnTypeDict == {}: 3047 logger.error("Did not provide a columnTypeDict.") 3048 # create list of columns: 3049 cols = [] 3050 for col in self.tableToLoad.columns: 3051 if col in table_schema_by_cname: 3052 col_type = table_schema_by_cname[col]["columnType"] 3053 max_size = ( 3054 table_schema_by_cname[col]["maximumSize"] 3055 if "maximumSize" in table_schema_by_cname[col].keys() 3056 else 100 3057 ) 3058 max_list_len = 250 3059 if max_size and max_list_len: 3060 cols.append( 3061 Column( 3062 name=col, 3063 columnType=col_type, 3064 maximumSize=max_size, 3065 maximumListLength=max_list_len, 3066 ) 3067 ) 3068 elif max_size: 3069 cols.append( 3070 Column(name=col, columnType=col_type, maximumSize=max_size) 3071 ) 3072 else: 3073 cols.append(Column(name=col, columnType=col_type)) 3074 else: 3075 # TODO add warning that the given col was 
not found and it's max size is set to 100 3076 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3077 schema = Schema( 3078 name=self.tableName, columns=cols, parent=datasetParentProject 3079 ) 3080 table = Table(schema, self.tableToLoad) 3081 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3082 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3083 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3084 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3085 return table.schema.id 3086 else: 3087 # For just uploading the tables to synapse using default 3088 # column types. 3089 table = build_table(self.tableName, datasetParentProject, self.tableToLoad) 3090 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3091 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3092 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3093 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3094 return table.schema.id 3095 3096 @tracer.start_as_current_span("TableOperations::replaceTable") 3097 def replaceTable( 3098 self, 3099 specifySchema: bool = True, 3100 columnTypeDict: dict = None, 3101 ): 3102 """ 3103 Method to replace an existing table on synapse with metadata from a new manifest 3104 3105 Args: 3106 specifySchema: to infer a schema for the table format 3107 columnTypeDict: dictionary schema for table columns: type, size, etc 3108 3109 Returns: 3110 existingTableId: synID of the already existing table that had its metadata replaced 3111 """ 3112 datasetEntity = self.synapse_entity_tracker.get( 3113 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3114 ) 3115 3116 datasetName = datasetEntity.name 3117 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3118 existing_table, existing_results = self.synStore.get_synapse_table( 3119 self.existingTableId 3120 ) 3121 # remove rows 3122 self.synStore.syn.delete(existing_results) 3123 # Data changes such as removing all rows causes the eTag to change. 
3124 self.synapse_entity_tracker.remove(synapse_id=self.existingTableId) 3125 # wait for row deletion to finish on synapse before getting empty table 3126 sleep(10) 3127 3128 # removes all current columns 3129 current_table = self.synapse_entity_tracker.get( 3130 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3131 ) 3132 3133 current_columns = self.synStore.syn.getTableColumns(current_table) 3134 for col in current_columns: 3135 current_table.removeColumn(col) 3136 3137 if not self.tableName: 3138 self.tableName = datasetName + "table" 3139 3140 # Process columns according to manifest entries 3141 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3142 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3143 if specifySchema: 3144 if columnTypeDict == {}: 3145 logger.error("Did not provide a columnTypeDict.") 3146 # create list of columns: 3147 cols = [] 3148 3149 for col in self.tableToLoad.columns: 3150 if col in table_schema_by_cname: 3151 col_type = table_schema_by_cname[col]["columnType"] 3152 max_size = ( 3153 table_schema_by_cname[col]["maximumSize"] 3154 if "maximumSize" in table_schema_by_cname[col].keys() 3155 else 100 3156 ) 3157 max_list_len = 250 3158 if max_size and max_list_len: 3159 cols.append( 3160 Column( 3161 name=col, 3162 columnType=col_type, 3163 maximumSize=max_size, 3164 maximumListLength=max_list_len, 3165 ) 3166 ) 3167 elif max_size: 3168 cols.append( 3169 Column(name=col, columnType=col_type, maximumSize=max_size) 3170 ) 3171 else: 3172 cols.append(Column(name=col, columnType=col_type)) 3173 else: 3174 # TODO add warning that the given col was not found and it's max size is set to 100 3175 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3176 3177 # adds new columns to schema 3178 for col in cols: 3179 current_table.addColumn(col) 3180 table_result = self.synStore.syn.store( 3181 current_table, isRestricted=self.restrict 3182 ) 3183 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3184 # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema) 3185 self.synapse_entity_tracker.remove(synapse_id=table_result.id) 3186 3187 # wait for synapse store to finish 3188 sleep(1) 3189 3190 # build schema and table from columns and store with necessary restrictions 3191 schema = Schema( 3192 name=self.tableName, columns=cols, parent=datasetParentProject 3193 ) 3194 schema.id = self.existingTableId 3195 table = Table(schema, self.tableToLoad, etag=existing_results.etag) 3196 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3197 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3198 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3199 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3200 else: 3201 logging.error("Must specify a schema for table replacements") 3202 3203 # remove system metadata from manifest 3204 existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) 3205 return self.existingTableId 3206 3207 @tracer.start_as_current_span("TableOperations::_get_auth_token") 3208 def _get_auth_token( 3209 self, 3210 ): 3211 authtoken = None 3212 3213 # Get access token from environment variable if available 3214 # Primarily useful for testing environments, with other possible usefulness for containers 3215 env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN") 3216 if env_access_token: 3217 authtoken 
= env_access_token 3218 return authtoken 3219 3220 # Get token from authorization header 3221 # Primarily useful for API endpoint functionality 3222 if "Authorization" in self.synStore.syn.default_headers: 3223 authtoken = self.synStore.syn.default_headers["Authorization"].split( 3224 "Bearer " 3225 )[-1] 3226 return authtoken 3227 3228 # retrieve credentials from synapse object 3229 # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe 3230 synapse_object_creds = self.synStore.syn.credentials 3231 if hasattr(synapse_object_creds, "_token"): 3232 authtoken = synapse_object_creds.secret 3233 3234 # Try getting creds from .synapseConfig file if it exists 3235 # Primarily useful for local users. Seems to correlate with credentials stored in synapse object when logged in 3236 if os.path.exists(CONFIG.synapse_configuration_path): 3237 config = get_config_file(CONFIG.synapse_configuration_path) 3238 3239 # check which credentials are provided in file 3240 if config.has_option("authentication", "authtoken"): 3241 authtoken = config.get("authentication", "authtoken") 3242 3243 # raise error if required credentials are not found 3244 if not authtoken: 3245 raise NameError( 3246 "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file" 3247 ) 3248 3249 return authtoken 3250 3251 @tracer.start_as_current_span("TableOperations::upsertTable") 3252 def upsertTable(self, dmge: DataModelGraphExplorer): 3253 """ 3254 Method to upsert rows from a new manifest into an existing table on synapse 3255 For upsert functionality to work, primary keys must follow the naming convention of <component>_id 3256 `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. 3257 Currently it is required to use -dl/--use_display_label with table upserts. 
3258 3259 3260 Args: 3261 dmge: DataModelGraphExplorer instance 3262 3263 Returns: 3264 existingTableId: synID of the already existing table that had its metadata replaced 3265 """ 3266 3267 authtoken = self._get_auth_token() 3268 3269 synapseDB = SynapseDatabase( 3270 auth_token=authtoken, 3271 project_id=self.synStore.getDatasetProject(self.datasetId), 3272 syn=self.synStore.syn, 3273 synapse_entity_tracker=self.synapse_entity_tracker, 3274 ) 3275 3276 try: 3277 # Try performing upsert 3278 synapseDB.upsert_table_rows( 3279 table_name=self.tableName, data=self.tableToLoad 3280 ) 3281 except SynapseHTTPError as ex: 3282 # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload 3283 if "Id is not a valid column name or id" in str(ex): 3284 self._update_table_uuid_column(dmge) 3285 synapseDB.upsert_table_rows( 3286 table_name=self.tableName, data=self.tableToLoad 3287 ) 3288 # Raise if other error 3289 else: 3290 raise ex 3291 3292 return self.existingTableId 3293 3294 @tracer.start_as_current_span("TableOperations::_update_table_uuid_column") 3295 def _update_table_uuid_column( 3296 self, 3297 dmge: DataModelGraphExplorer, 3298 ) -> None: 3299 """Removes the `Uuid` column when present, and replaces it with an `Id` column 3300 Used to enable backwards compatibility for manifests using the old `Uuid` convention 3301 3302 Args: 3303 dmge: DataModelGraphExplorer instance 3304 3305 Returns: 3306 None 3307 """ 3308 3309 # Get the columns of the schema 3310 schema = self.synapse_entity_tracker.get( 3311 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3312 ) 3313 3314 cols = self.synStore.syn.getTableColumns(schema) 3315 3316 # Iterate through columns until `Uuid` column is found 3317 for col in cols: 3318 if col.name.lower() == "uuid": 3319 # See if schema has `Uuid` column specified 3320 try: 3321 uuid_col_in_schema = dmge.is_class_in_schema(col.name) 3322 except KeyError: 3323 uuid_col_in_schema = False 3324 3325 # If there is, then create a new `Id` column from scratch 3326 if uuid_col_in_schema: 3327 new_col = Column(columnType="STRING", maximumSize=64, name="Id") 3328 schema.addColumn(new_col) 3329 schema = self.synStore.syn.store(schema) 3330 # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema) 3331 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3332 self.synapse_entity_tracker.remove(synapse_id=schema.id) 3333 # If there is not, then use the old `Uuid` column as a basis for the new `Id` column 3334 else: 3335 # Build ColumnModel that will be used for new column 3336 id_column = Column( 3337 name="Id", 3338 columnType="STRING", 3339 maximumSize=64, 3340 defaultValue=None, 3341 maximumListLength=1, 3342 ) 3343 new_col_response = self.synStore.syn.store(id_column) 3344 3345 # Define columnChange body 3346 columnChangeDict = { 3347 "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest", 3348 "entityId": self.existingTableId, 3349 "changes": [ 3350 { 3351 "oldColumnId": col["id"], 3352 "newColumnId": new_col_response["id"], 3353 } 3354 ], 3355 } 3356 3357 self.synStore.syn._async_table_update( 3358 table=self.existingTableId, 3359 changes=[columnChangeDict], 3360 wait=False, 3361 ) 3362 break 3363 3364 return 3365 3366 @tracer.start_as_current_span("TableOperations::updateTable") 3367 def updateTable( 3368 self, 3369 update_col: str = "Id", 3370 ): 3371 """ 3372 Method to update an existing table with a new column 3373 3374 Args: 3375 
update_col: column to index the old and new tables on 3376 3377 Returns: 3378 existingTableId: synID of the already existing table that had its metadata replaced 3379 """ 3380 existing_table, existing_results = self.synStore.get_synapse_table( 3381 self.existingTableId 3382 ) 3383 3384 self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) 3385 # store table with existing etag data and impose restrictions as appropriate 3386 table_result = self.synStore.syn.store( 3387 Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), 3388 isRestricted=self.restrict, 3389 ) 3390 # We cannot store the Table to the `synapse_entity_tracker` because there is 3391 # no `Schema` on the table object. The above `.store()` function call would 3392 # also update the ETag of the entity within Synapse. Remove it from the tracker 3393 # and re-retrieve it later on if needed again. 3394 self.synapse_entity_tracker.remove(synapse_id=table_result.tableId) 3395 3396 return self.existingTableId
Object to hold functions for various table operations specific to the Synapse Asset Store.
Currently implemented operations are:
- createTable: upload a manifest as a new table when none exists
- replaceTable: replace metadata in a table from one manifest with metadata from another manifest
- updateTable: add a column to a table that already exists on Synapse

Operations currently in development are:
- upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
2991 def __init__( 2992 self, 2993 synStore: SynapseStorage, 2994 tableToLoad: pd.DataFrame = None, 2995 tableName: str = None, 2996 datasetId: str = None, 2997 existingTableId: str = None, 2998 restrict: bool = False, 2999 synapse_entity_tracker: SynapseEntityTracker = None, 3000 ): 3001 """ 3002 Class governing table operations (creation, replacement, upserts, updates) in schematic 3003 3004 tableToLoad: manifest formatted appropriately for the table 3005 tableName: name of the table to be uploaded 3006 datasetId: synID of the dataset for the manifest 3007 existingTableId: synID of the table currently existing on synapse (if there is one) 3008 restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions 3009 synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities 3010 3011 """ 3012 self.synStore = synStore 3013 self.tableToLoad = tableToLoad 3014 self.tableName = tableName 3015 self.datasetId = datasetId 3016 self.existingTableId = existingTableId 3017 self.restrict = restrict 3018 self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
Class governing table operations (creation, replacement, upserts, updates) in schematic
- tableToLoad: manifest formatted appropriately for the table
- tableName: name of the table to be uploaded
- datasetId: synID of the dataset for the manifest
- existingTableId: synID of the table currently existing on Synapse (if there is one)
- restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
- synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
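A construction sketch, assuming an initialized SynapseStorage named `store` and a manifest already loaded into a pandas DataFrame named `manifest_df`; IDs and names are placeholders:

    ops = TableOperations(
        synStore=store,
        tableToLoad=manifest_df,
        tableName="example_manifest_table",
        datasetId="syn12345678",
        restrict=False,  # no additional access restrictions
    )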
3020 @tracer.start_as_current_span("TableOperations::createTable") 3021 def createTable( 3022 self, 3023 columnTypeDict: dict = None, 3024 specifySchema: bool = True, 3025 ): 3026 """ 3027 Method to create a table from a metadata manifest and upload it to synapse 3028 3029 Args: 3030 columnTypeDict: dictionary schema for table columns: type, size, etc 3031 specifySchema: to specify a specific schema for the table format 3032 3033 Returns: 3034 table.schema.id: synID of the newly created table 3035 """ 3036 datasetEntity = self.synapse_entity_tracker.get( 3037 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3038 ) 3039 datasetName = datasetEntity.name 3040 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3041 3042 if not self.tableName: 3043 self.tableName = datasetName + "table" 3044 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3045 if specifySchema: 3046 if columnTypeDict == {}: 3047 logger.error("Did not provide a columnTypeDict.") 3048 # create list of columns: 3049 cols = [] 3050 for col in self.tableToLoad.columns: 3051 if col in table_schema_by_cname: 3052 col_type = table_schema_by_cname[col]["columnType"] 3053 max_size = ( 3054 table_schema_by_cname[col]["maximumSize"] 3055 if "maximumSize" in table_schema_by_cname[col].keys() 3056 else 100 3057 ) 3058 max_list_len = 250 3059 if max_size and max_list_len: 3060 cols.append( 3061 Column( 3062 name=col, 3063 columnType=col_type, 3064 maximumSize=max_size, 3065 maximumListLength=max_list_len, 3066 ) 3067 ) 3068 elif max_size: 3069 cols.append( 3070 Column(name=col, columnType=col_type, maximumSize=max_size) 3071 ) 3072 else: 3073 cols.append(Column(name=col, columnType=col_type)) 3074 else: 3075 # TODO add warning that the given col was not found and it's max size is set to 100 3076 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3077 schema = Schema( 3078 name=self.tableName, columns=cols, parent=datasetParentProject 3079 ) 3080 table = Table(schema, self.tableToLoad) 3081 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3082 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3083 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3084 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3085 return table.schema.id 3086 else: 3087 # For just uploading the tables to synapse using default 3088 # column types. 3089 table = build_table(self.tableName, datasetParentProject, self.tableToLoad) 3090 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3091 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3092 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3093 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3094 return table.schema.id
Method to create a table from a metadata manifest and upload it to synapse
Arguments:
- columnTypeDict: dictionary schema for table columns: type, size, etc
- specifySchema: to specify a specific schema for the table format
Returns:
table.schema.id: synID of the newly created table
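Continuing the sketch above: the simplest path is to let Synapse infer default column types via build_table. Passing specifySchema=True additionally requires a columnTypeDict, whose exact shape is consumed by _get_table_schema_by_cname and is not shown in this excerpt.

    new_table_id = ops.createTable(specifySchema=False)  # default column types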
3096 @tracer.start_as_current_span("TableOperations::replaceTable") 3097 def replaceTable( 3098 self, 3099 specifySchema: bool = True, 3100 columnTypeDict: dict = None, 3101 ): 3102 """ 3103 Method to replace an existing table on synapse with metadata from a new manifest 3104 3105 Args: 3106 specifySchema: to infer a schema for the table format 3107 columnTypeDict: dictionary schema for table columns: type, size, etc 3108 3109 Returns: 3110 existingTableId: synID of the already existing table that had its metadata replaced 3111 """ 3112 datasetEntity = self.synapse_entity_tracker.get( 3113 synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False 3114 ) 3115 3116 datasetName = datasetEntity.name 3117 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3118 existing_table, existing_results = self.synStore.get_synapse_table( 3119 self.existingTableId 3120 ) 3121 # remove rows 3122 self.synStore.syn.delete(existing_results) 3123 # Data changes such as removing all rows causes the eTag to change. 3124 self.synapse_entity_tracker.remove(synapse_id=self.existingTableId) 3125 # wait for row deletion to finish on synapse before getting empty table 3126 sleep(10) 3127 3128 # removes all current columns 3129 current_table = self.synapse_entity_tracker.get( 3130 synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False 3131 ) 3132 3133 current_columns = self.synStore.syn.getTableColumns(current_table) 3134 for col in current_columns: 3135 current_table.removeColumn(col) 3136 3137 if not self.tableName: 3138 self.tableName = datasetName + "table" 3139 3140 # Process columns according to manifest entries 3141 table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict) 3142 datasetParentProject = self.synStore.getDatasetProject(self.datasetId) 3143 if specifySchema: 3144 if columnTypeDict == {}: 3145 logger.error("Did not provide a columnTypeDict.") 3146 # create list of columns: 3147 cols = [] 3148 3149 for col in self.tableToLoad.columns: 3150 if col in table_schema_by_cname: 3151 col_type = table_schema_by_cname[col]["columnType"] 3152 max_size = ( 3153 table_schema_by_cname[col]["maximumSize"] 3154 if "maximumSize" in table_schema_by_cname[col].keys() 3155 else 100 3156 ) 3157 max_list_len = 250 3158 if max_size and max_list_len: 3159 cols.append( 3160 Column( 3161 name=col, 3162 columnType=col_type, 3163 maximumSize=max_size, 3164 maximumListLength=max_list_len, 3165 ) 3166 ) 3167 elif max_size: 3168 cols.append( 3169 Column(name=col, columnType=col_type, maximumSize=max_size) 3170 ) 3171 else: 3172 cols.append(Column(name=col, columnType=col_type)) 3173 else: 3174 # TODO add warning that the given col was not found and it's max size is set to 100 3175 cols.append(Column(name=col, columnType="STRING", maximumSize=100)) 3176 3177 # adds new columns to schema 3178 for col in cols: 3179 current_table.addColumn(col) 3180 table_result = self.synStore.syn.store( 3181 current_table, isRestricted=self.restrict 3182 ) 3183 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3184 # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema) 3185 self.synapse_entity_tracker.remove(synapse_id=table_result.id) 3186 3187 # wait for synapse store to finish 3188 sleep(1) 3189 3190 # build schema and table from columns and store with necessary restrictions 3191 schema = Schema( 3192 name=self.tableName, columns=cols, parent=datasetParentProject 3193 ) 3194 schema.id = 
self.existingTableId 3195 table = Table(schema, self.tableToLoad, etag=existing_results.etag) 3196 table = self.synStore.syn.store(table, isRestricted=self.restrict) 3197 # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved 3198 # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema) 3199 self.synapse_entity_tracker.remove(synapse_id=table.schema.id) 3200 else: 3201 logging.error("Must specify a schema for table replacements") 3202 3203 # remove system metadata from manifest 3204 existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True) 3205 return self.existingTableId
Method to replace an existing table on synapse with metadata from a new manifest
Arguments:
- specifySchema: whether to specify a schema for the table format
- columnTypeDict: dictionary schema for table columns: type, size, etc
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
3251 @tracer.start_as_current_span("TableOperations::upsertTable") 3252 def upsertTable(self, dmge: DataModelGraphExplorer): 3253 """ 3254 Method to upsert rows from a new manifest into an existing table on synapse 3255 For upsert functionality to work, primary keys must follow the naming convention of <component>_id 3256 `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. 3257 Currently it is required to use -dl/--use_display_label with table upserts. 3258 3259 3260 Args: 3261 dmge: DataModelGraphExplorer instance 3262 3263 Returns: 3264 existingTableId: synID of the already existing table that had its metadata replaced 3265 """ 3266 3267 authtoken = self._get_auth_token() 3268 3269 synapseDB = SynapseDatabase( 3270 auth_token=authtoken, 3271 project_id=self.synStore.getDatasetProject(self.datasetId), 3272 syn=self.synStore.syn, 3273 synapse_entity_tracker=self.synapse_entity_tracker, 3274 ) 3275 3276 try: 3277 # Try performing upsert 3278 synapseDB.upsert_table_rows( 3279 table_name=self.tableName, data=self.tableToLoad 3280 ) 3281 except SynapseHTTPError as ex: 3282 # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload 3283 if "Id is not a valid column name or id" in str(ex): 3284 self._update_table_uuid_column(dmge) 3285 synapseDB.upsert_table_rows( 3286 table_name=self.tableName, data=self.tableToLoad 3287 ) 3288 # Raise if other error 3289 else: 3290 raise ex 3291 3292 return self.existingTableId
Method to upsert rows from a new manifest into an existing table on synapse
For upsert functionality to work, primary keys must follow the naming convention of <component>_id.
`-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
Currently it is required to use -dl/--use_display_label with table upserts.
Arguments:
- dmge: DataModelGraphExplorer instance
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
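A sketch of the upsert path, assuming the TableOperations instance `ops` from above was built with tableName and existingTableId set, and that the manifest carries the required primary key (e.g. a "Patient_id" column for a "Patient" component):

    table_id = ops.upsertTable(dmge=dmge)  # dmge: DataModelGraphExplorer instance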
3366 @tracer.start_as_current_span("TableOperations::updateTable") 3367 def updateTable( 3368 self, 3369 update_col: str = "Id", 3370 ): 3371 """ 3372 Method to update an existing table with a new column 3373 3374 Args: 3375 update_col: column to index the old and new tables on 3376 3377 Returns: 3378 existingTableId: synID of the already existing table that had its metadata replaced 3379 """ 3380 existing_table, existing_results = self.synStore.get_synapse_table( 3381 self.existingTableId 3382 ) 3383 3384 self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col) 3385 # store table with existing etag data and impose restrictions as appropriate 3386 table_result = self.synStore.syn.store( 3387 Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag), 3388 isRestricted=self.restrict, 3389 ) 3390 # We cannot store the Table to the `synapse_entity_tracker` because there is 3391 # no `Schema` on the table object. The above `.store()` function call would 3392 # also update the ETag of the entity within Synapse. Remove it from the tracker 3393 # and re-retrieve it later on if needed again. 3394 self.synapse_entity_tracker.remove(synapse_id=table_result.tableId) 3395 3396 return self.existingTableId
Method to update an existing table with a new column
Arguments:
- update_col: column to index the old and new tables on
Returns:
existingTableId: synID of the already existing table that had its metadata replaced
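A sketch of the update path; `ops` is assumed to have existingTableId set, and the old and new tables are aligned on the "Id" column by default:

    table_id = ops.updateTable(update_col="Id")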
3399class DatasetFileView: 3400 """Helper class to create temporary dataset file views. 3401 This class can be used in conjunction with a 'with' statement. 3402 This will ensure that the file view is deleted automatically. 3403 See SynapseStorage.getDatasetAnnotationsBatch for example usage. 3404 """ 3405 3406 def __init__( 3407 self, 3408 datasetId: str, 3409 synapse: Synapse, 3410 name: str = None, 3411 temporary: bool = True, 3412 parentId: str = None, 3413 ) -> None: 3414 """Create a file view scoped to a dataset folder. 3415 3416 Args: 3417 datasetId (str): Synapse ID for a dataset folder/project. 3418 synapse (Synapse): Used for Synapse requests. 3419 name (str): Name of the file view (temporary or not). 3420 temporary (bool): Whether to delete the file view on exit 3421 of either a 'with' statement or Python entirely. 3422 parentId (str, optional): Synapse ID specifying where to 3423 store the file view. Defaults to datasetId. 3424 """ 3425 3426 self.datasetId = datasetId 3427 self.synapse = synapse 3428 self.is_temporary = temporary 3429 3430 if name is None: 3431 self.name = f"schematic annotation file view for {self.datasetId}" 3432 3433 if self.is_temporary: 3434 uid = secrets.token_urlsafe(5) 3435 self.name = f"{self.name} - UID {uid}" 3436 3437 # TODO: Allow a DCC admin to configure a "universal parent" 3438 # Such as a Synapse project writeable by everyone. 3439 self.parentId = datasetId if parentId is None else parentId 3440 3441 # TODO: Create local sharing setting to hide from everyone else 3442 view_schema = EntityViewSchema( 3443 name=self.name, 3444 parent=self.parentId, 3445 scopes=self.datasetId, 3446 includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], 3447 addDefaultViewColumns=False, 3448 addAnnotationColumns=True, 3449 ) 3450 3451 # TODO: Handle failure due to insufficient permissions by 3452 # creating a temporary new project to store view 3453 self.view_schema = self.synapse.store(view_schema) 3454 3455 # These are filled in after calling `self.query()` 3456 self.results = None 3457 self.table = None 3458 3459 # Ensure deletion of the file view (last resort) 3460 if self.is_temporary: 3461 atexit.register(self.delete) 3462 3463 def __enter__(self): 3464 """Return file view when entering 'with' statement.""" 3465 return self 3466 3467 def __exit__(self, exc_type, exc_value, traceback): 3468 """Delete file view when exiting 'with' statement.""" 3469 if self.is_temporary: 3470 self.delete() 3471 3472 def delete(self): 3473 """Delete the file view on Synapse without deleting local table.""" 3474 if self.view_schema is not None: 3475 self.synapse.delete(self.view_schema) 3476 self.view_schema = None 3477 3478 def query(self, tidy=True, force=False): 3479 """Retrieve file view as a data frame (raw format sans index).""" 3480 if self.table is None or force: 3481 fileview_id = self.view_schema["id"] 3482 self.results = self.synapse.tableQuery(f"select * from {fileview_id}") 3483 self.table = self.results.asDataFrame( 3484 rowIdAndVersionInIndex=False, 3485 na_values=STR_NA_VALUES_FILTERED, 3486 keep_default_na=False, 3487 ) 3488 if tidy: 3489 self.tidy_table() 3490 return self.table 3491 3492 def tidy_table(self): 3493 """Convert raw file view data frame into more usable format.""" 3494 assert self.table is not None, "Must call `self.query()` first." 
3495 self._fix_default_columns() 3496 self._fix_list_columns() 3497 self._fix_int_columns() 3498 return self.table 3499 3500 def _fix_default_columns(self): 3501 """Rename default columns to match schematic expectations.""" 3502 3503 # Drop ROW_VERSION column if present 3504 if "ROW_VERSION" in self.table: 3505 del self.table["ROW_VERSION"] 3506 3507 # Rename id column to entityId and set as data frame index 3508 if "ROW_ID" in self.table: 3509 self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str) 3510 self.table = self.table.set_index("entityId", drop=False) 3511 del self.table["ROW_ID"] 3512 3513 # Rename ROW_ETAG column to eTag and place at end of data frame 3514 if "ROW_ETAG" in self.table: 3515 row_etags = self.table.pop("ROW_ETAG") 3516 3517 # eTag column may already present if users annotated data without submitting manifest 3518 # we're only concerned with the new values and not the existing ones 3519 if "eTag" in self.table: 3520 del self.table["eTag"] 3521 3522 self.table.insert(len(self.table.columns), "eTag", row_etags) 3523 3524 return self.table 3525 3526 def _get_columns_of_type(self, types): 3527 """Helper function to get list of columns of a given type(s).""" 3528 matching_columns = [] 3529 for header in self.results.headers: 3530 if header.columnType in types: 3531 matching_columns.append(header.name) 3532 return matching_columns 3533 3534 def _fix_list_columns(self): 3535 """Fix formatting of list-columns.""" 3536 list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"} 3537 list_columns = self._get_columns_of_type(list_types) 3538 for col in list_columns: 3539 self.table[col] = self.table[col].apply(lambda x: ", ".join(x)) 3540 return self.table 3541 3542 def _fix_int_columns(self): 3543 """Ensure that integer-columns are actually integers.""" 3544 int_columns = self._get_columns_of_type({"INTEGER"}) 3545 for col in int_columns: 3546 # Coercing to string because NaN is a floating point value 3547 # and cannot exist alongside integers in a column 3548 def to_int_fn(x): 3549 return "" if np.isnan(x) else str(int(x)) 3550 3551 self.table[col] = self.table[col].apply(to_int_fn) 3552 return self.table
Helper class to create temporary dataset file views. This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3406 def __init__( 3407 self, 3408 datasetId: str, 3409 synapse: Synapse, 3410 name: str = None, 3411 temporary: bool = True, 3412 parentId: str = None, 3413 ) -> None: 3414 """Create a file view scoped to a dataset folder. 3415 3416 Args: 3417 datasetId (str): Synapse ID for a dataset folder/project. 3418 synapse (Synapse): Used for Synapse requests. 3419 name (str): Name of the file view (temporary or not). 3420 temporary (bool): Whether to delete the file view on exit 3421 of either a 'with' statement or Python entirely. 3422 parentId (str, optional): Synapse ID specifying where to 3423 store the file view. Defaults to datasetId. 3424 """ 3425 3426 self.datasetId = datasetId 3427 self.synapse = synapse 3428 self.is_temporary = temporary 3429 3430 if name is None: 3431 self.name = f"schematic annotation file view for {self.datasetId}" 3432 3433 if self.is_temporary: 3434 uid = secrets.token_urlsafe(5) 3435 self.name = f"{self.name} - UID {uid}" 3436 3437 # TODO: Allow a DCC admin to configure a "universal parent" 3438 # Such as a Synapse project writeable by everyone. 3439 self.parentId = datasetId if parentId is None else parentId 3440 3441 # TODO: Create local sharing setting to hide from everyone else 3442 view_schema = EntityViewSchema( 3443 name=self.name, 3444 parent=self.parentId, 3445 scopes=self.datasetId, 3446 includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER], 3447 addDefaultViewColumns=False, 3448 addAnnotationColumns=True, 3449 ) 3450 3451 # TODO: Handle failure due to insufficient permissions by 3452 # creating a temporary new project to store view 3453 self.view_schema = self.synapse.store(view_schema) 3454 3455 # These are filled in after calling `self.query()` 3456 self.results = None 3457 self.table = None 3458 3459 # Ensure deletion of the file view (last resort) 3460 if self.is_temporary: 3461 atexit.register(self.delete)
Create a file view scoped to a dataset folder.
Arguments:
- datasetId (str): Synapse ID for a dataset folder/project.
- synapse (Synapse): Used for Synapse requests.
- name (str): Name of the file view (temporary or not).
- temporary (bool): Whether to delete the file view on exit of either a 'with' statement or Python entirely.
- parentId (str, optional): Synapse ID specifying where to store the file view. Defaults to datasetId.
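As the class docstring suggests, the intended pattern is a 'with' statement so the temporary view is cleaned up automatically; `syn` is assumed to be a logged-in synapseclient.Synapse instance and the dataset ID is a placeholder:

    with DatasetFileView("syn12345678", syn) as fileview:
        table = fileview.query()  # annotations as a tidied data frame
    # the temporary file view has been deleted from Synapse at this point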
3472 def delete(self): 3473 """Delete the file view on Synapse without deleting local table.""" 3474 if self.view_schema is not None: 3475 self.synapse.delete(self.view_schema) 3476 self.view_schema = None
Delete the file view on Synapse without deleting local table.
3478 def query(self, tidy=True, force=False): 3479 """Retrieve file view as a data frame (raw format sans index).""" 3480 if self.table is None or force: 3481 fileview_id = self.view_schema["id"] 3482 self.results = self.synapse.tableQuery(f"select * from {fileview_id}") 3483 self.table = self.results.asDataFrame( 3484 rowIdAndVersionInIndex=False, 3485 na_values=STR_NA_VALUES_FILTERED, 3486 keep_default_na=False, 3487 ) 3488 if tidy: 3489 self.tidy_table() 3490 return self.table
Retrieve file view as a data frame (raw format sans index).
3492 def tidy_table(self): 3493 """Convert raw file view data frame into more usable format.""" 3494 assert self.table is not None, "Must call `self.query()` first." 3495 self._fix_default_columns() 3496 self._fix_list_columns() 3497 self._fix_int_columns() 3498 return self.table
Convert raw file view data frame into more usable format.
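The integer fix in _fix_int_columns exists because pandas represents an integer column containing NaN as floats; the snippet below (a standalone illustration of the same logic) restores integer-looking strings, with blanks for missing values:

    import numpy as np
    import pandas as pd

    col = pd.Series([1.0, np.nan, 3.0])  # an INTEGER view column with a missing value
    fixed = col.apply(lambda x: "" if np.isnan(x) else str(int(x)))
    print(fixed.tolist())  # ['1', '', '3']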