schematic.store.synapse

Synapse storage class

   1"""Synapse storage class"""
   2
   3import asyncio
   4import atexit
   5import logging
   6import os
   7import re
   8import secrets
   9import shutil
  10import time
  11import uuid  # used to generate unique names for entities
  12from copy import deepcopy
  13from dataclasses import dataclass, field
  14from time import sleep
  15
  16# allows specifying explicit variable types
  17from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
  18
  19import numpy as np
  20import pandas as pd
  21import synapseclient
  22from opentelemetry import trace
  23from synapseclient import Annotations as OldAnnotations
  24from synapseclient import (
  25    Column,
  26    EntityViewSchema,
  27    EntityViewType,
  28    File,
  29    Folder,
  30    Schema,
  31    Synapse,
  32    Table,
  33    as_table_columns,
  34)
  35from synapseclient.annotations import _convert_to_annotations_list
  36from synapseclient.api import get_config_file, get_entity_id_bundle2
  37from synapseclient.core.constants.concrete_types import PROJECT_ENTITY
  38from synapseclient.core.exceptions import (
  39    SynapseAuthenticationError,
  40    SynapseHTTPError,
  41    SynapseUnmetAccessRestrictions,
  42)
  43from synapseclient.models.annotations import Annotations
  44from synapseclient.table import CsvFileTable, Schema, build_table
  45from tenacity import (
  46    retry,
  47    retry_if_exception_type,
  48    stop_after_attempt,
  49    wait_chain,
  50    wait_fixed,
  51)
  52
  53from schematic.configuration.configuration import CONFIG
  54from schematic.exceptions import AccessCredentialsError
   55from schematic.schemas.data_model_graph import DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
  56from schematic.store.base import BaseStorage
  57from schematic.store.database.synapse_database import SynapseDatabase
  58from schematic.store.synapse_tracker import SynapseEntityTracker
  59from schematic.utils.df_utils import (
  60    STR_NA_VALUES_FILTERED,
  61    col_in_dataframe,
  62    load_df,
  63    update_df,
  64)
  65
  66# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
  67# Please do not remove these import statements
  68from schematic.utils.general import (
  69    check_synapse_cache_size,
  70    clear_synapse_cache,
  71    create_temp_folder,
  72    entity_type_mapping,
  73    get_dir_size,
  74)
  75from schematic.utils.io_utils import cleanup_temporary_storage
  76from schematic.utils.schema_utils import get_class_label_from_display_name
  77from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list
  78
  79
  80logger = logging.getLogger("Synapse storage")
  81
  82tracer = trace.get_tracer("Schematic")
  83
  84
  85@dataclass
  86class ManifestDownload(object):
  87    """
  88    syn: an object of type synapseclient.
  89    manifest_id: id of a manifest
  90    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
  91    """
  92
  93    syn: synapseclient.Synapse
  94    manifest_id: str
  95    synapse_entity_tracker: SynapseEntityTracker = field(
  96        default_factory=SynapseEntityTracker
  97    )
  98
  99    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
 100        """
 101        Try downloading a manifest to a specific folder (temporary or not). When the
 102        `use_temporary_folder` is set to True, the manifest will be downloaded to a
 103        temporary folder. This is useful for when the code is running as an API server
 104        where multiple requests are being made at the same time. This will prevent
 105        multiple requests from overwriting the same manifest file. When the
 106        `use_temporary_folder` is set to False, the manifest will be downloaded to the
 107        default manifest folder.
 108
 109        Args:
 110            use_temporary_folder: boolean argument indicating if a temporary folder
 111                should be used to store the manifest file. This is useful when running
 112                this code as an API server where multiple requests could be made at the
 113                same time. This is set to False when the code is being used from the
 114                CLI. Defaults to True.
 115
 116        Return:
 117            manifest_data: A Synapse file entity of the downloaded manifest
 118        """
 119        manifest_data = self.synapse_entity_tracker.get(
 120            synapse_id=self.manifest_id,
 121            syn=self.syn,
 122            download_file=False,
 123            retrieve_if_not_present=False,
 124        )
 125        current_span = trace.get_current_span()
 126        if (
 127            manifest_data
 128            and (file_handle := manifest_data.get("_file_handle", None))
 129            and current_span.is_recording()
 130        ):
 131            current_span.set_attribute(
 132                "schematic.manifest_size", file_handle.get("contentSize", 0)
 133            )
 134
 135        if manifest_data and manifest_data.path:
 136            return manifest_data
 137
 138        if "SECRETS_MANAGER_SECRETS" in os.environ:
 139            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 140            cleanup_temporary_storage(
 141                temporary_manifest_storage, time_delta_seconds=3600
 142            )
 143            # create a new directory to store manifest
 144            if not os.path.exists(temporary_manifest_storage):
 145                os.mkdir(temporary_manifest_storage)
 146            # create temporary folders for storing manifests
 147            download_location = create_temp_folder(
 148                path=temporary_manifest_storage,
 149                prefix=f"{self.manifest_id}-{time.time()}-",
 150            )
 151        else:
 152            if use_temporary_folder:
 153                download_location = create_temp_folder(
 154                    path=CONFIG.manifest_folder,
 155                    prefix=f"{self.manifest_id}-{time.time()}-",
 156                )
 157            else:
 158                download_location = CONFIG.manifest_folder
 159
 160        manifest_data = self.synapse_entity_tracker.get(
 161            synapse_id=self.manifest_id,
 162            syn=self.syn,
 163            download_file=True,
 164            retrieve_if_not_present=True,
 165            download_location=download_location,
 166        )
 167
  168        # Rename the downloaded file back to its original name. This matters
  169        # when we re-use a previously downloaded file that has since been
  170        # renamed: the file downloaded by the Synapse client is a direct copy
  171        # of that renamed file. This code restores the name that was
  172        # originally used to download the file. Note: an MD5 checksum of the
  173        # file is still performed, so if the file has changed, it will be
  174        # downloaded again.
 175        filename = manifest_data._file_handle.fileName
 176        if filename != os.path.basename(manifest_data.path):
 177            parent_folder = os.path.dirname(manifest_data.path)
 178            manifest_original_name_and_path = os.path.join(parent_folder, filename)
 179
 180            self.syn.cache.remove(
 181                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
 182            )
 183            os.rename(manifest_data.path, manifest_original_name_and_path)
 184            manifest_data.path = manifest_original_name_and_path
 185            self.syn.cache.add(
 186                file_handle_id=manifest_data.dataFileHandleId,
 187                path=manifest_original_name_and_path,
 188                md5=manifest_data._file_handle.contentMd5,
 189            )
 190
 191        return manifest_data
 192
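    # Illustration for `_download_manifest_to_folder` (hypothetical values):
    # with `use_temporary_folder=True`, the manifest for "syn123" is downloaded
    # into a per-request folder whose name combines the manifest ID and a
    # timestamp, e.g.
    #
    #     <CONFIG.manifest_folder>/syn123-1700000000.0-<suffix>/
    #
    # (the exact suffix depends on `create_temp_folder`), so concurrent API
    # requests never overwrite each other's downloads.
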
  193    def _entity_type_checking(self) -> None:
  194        """
  195        Check the entity type of the ID that needs to be downloaded.
  196        Return:
  197             None; logs an error if the entity type is not "file".
  198        """
 199        # check the type of entity
 200        entity_type = entity_type_mapping(
 201            syn=self.syn,
 202            entity_id=self.manifest_id,
 203            synapse_entity_tracker=self.synapse_entity_tracker,
 204        )
 205        if entity_type != "file":
 206            logger.error(
 207                f"You are using entity type: {entity_type}. Please provide a file ID"
 208            )
 209
 210    def download_manifest(
 211        self,
 212        newManifestName: str = "",
 213        manifest_df: pd.DataFrame = pd.DataFrame(),
 214        use_temporary_folder: bool = True,
 215    ) -> Union[str, File]:
 216        """
 217        Download a manifest based on a given manifest id.
 218        Args:
 219            newManifestName(optional): new name of a manifest that gets downloaded.
  220            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
                 use_temporary_folder(optional): whether to download the manifest into a temporary folder; see `_download_manifest_to_folder`. Defaults to True.
 221        Return:
 222            manifest_data: synapse entity file object
 223        """
 224
 225        # enables retrying if user does not have access to uncensored manifest
 226        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
 227        manifest_data = ""
 228
 229        # check entity type
 230        self._entity_type_checking()
 231
 232        # download a manifest
 233        try:
 234            manifest_data = self._download_manifest_to_folder(
 235                use_temporary_folder=use_temporary_folder
 236            )
 237        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
 238            # if there's an error getting an uncensored manifest, try getting the censored manifest
 239            if not manifest_df.empty:
 240                censored_regex = re.compile(".*censored.*")
 241                censored = manifest_df["name"].str.contains(censored_regex)
  242                new_manifest_id = manifest_df[censored]["id"].iloc[0]
 243                self.manifest_id = new_manifest_id
 244                try:
 245                    manifest_data = self._download_manifest_to_folder(
 246                        use_temporary_folder=use_temporary_folder
 247                    )
 248                except (
 249                    SynapseUnmetAccessRestrictions,
 250                    SynapseAuthenticationError,
 251                ) as e:
 252                    raise PermissionError(
 253                        "You don't have access to censored and uncensored manifests in this dataset."
 254                    ) from e
 255            else:
 256                logger.error(
 257                    f"You don't have access to the requested resource: {self.manifest_id}"
 258                )
 259
 260        if newManifestName and os.path.exists(manifest_data.get("path")):
 261            # Rename the file we just made to the new name
 262            new_manifest_filename = newManifestName + ".csv"
 263
 264            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
 265            parent_folder = os.path.dirname(manifest_data.get("path"))
 266
 267            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
 268
 269            # Copy file to new location. The purpose of using a copy instead of a rename
 270            # is to avoid any potential issues with the file being used in another
  271            # process. This avoids potential race or concurrency conditions.
 272            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
 273
 274            # Adding this to cache will allow us to re-use the already downloaded
 275            # manifest file for up to 1 hour.
 276            self.syn.cache.add(
 277                file_handle_id=manifest_data.dataFileHandleId,
 278                path=new_manifest_path_name,
 279                md5=manifest_data._file_handle.contentMd5,
 280            )
 281
 282            # Update file names/paths in manifest_data
 283            manifest_data["name"] = new_manifest_filename
 284            manifest_data["filename"] = new_manifest_filename
 285            manifest_data["path"] = new_manifest_path_name
 286
 287        return manifest_data
 288
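# Example (hypothetical usage sketch; "syn12345678" is a placeholder ID):
#
#     import synapseclient
#     syn = synapseclient.login(authToken="<access token>")
#     md = ManifestDownload(syn=syn, manifest_id="syn12345678")
#     manifest_file = md.download_manifest(newManifestName="renamed_manifest")
#     print(manifest_file.path)
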
 289
 290class SynapseStorage(BaseStorage):
 291    """Implementation of Storage interface for datasets/files stored on Synapse.
  292    Provides utilities to list files in a specific project, update file annotations, create fileviews, etc.
 293
 294    TODO: Need to define the interface and rename and/or refactor some of the methods below.
 295    """
 296
 297    @tracer.start_as_current_span("SynapseStorage::__init__")
 298    def __init__(
 299        self,
 300        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
 301        access_token: Optional[str] = None,
 302        project_scope: Optional[list] = None,
 303        synapse_cache_path: Optional[str] = None,
 304        perform_query: Optional[bool] = True,
 305        columns: Optional[list] = None,
 306        where_clauses: Optional[list] = None,
 307    ) -> None:
 308        """Initializes a SynapseStorage object.
 309
 310        Args:
 311            token (Optional[str], optional):
 312              Optional token parameter as found in browser cookie upon login to synapse.
 313              Defaults to None.
  314            access_token (Optional[str], optional):
 315              Optional access token (personal or oauth).
 316              Defaults to None.
  317            project_scope (Optional[list], optional): List of project IDs used to scope fileview queries. Defaults to None.
 318            synapse_cache_path (Optional[str], optional):
 319              Location of synapse cache.
 320              Defaults to None.
 321        TODO:
  322            Consider the necessity of adding "columns" and "where_clauses" params to the constructor. Currently, with how `query_fileview` is implemented, these params are not needed at this step, but they could be useful in the future if the need for more scoped queries expands.
 323        """
 324        self.syn = self.login(synapse_cache_path, access_token)
 325        self.project_scope = project_scope
 326        self.storageFileview = CONFIG.synapse_master_fileview_id
 327        self.manifest = CONFIG.synapse_manifest_basename
 328        self.root_synapse_cache = self.syn.cache.cache_root_dir
 329        self.synapse_entity_tracker = SynapseEntityTracker()
 330        if perform_query:
 331            self.query_fileview(columns=columns, where_clauses=where_clauses)
 332
  333    # TODO: When moving this over to a regular cron-job, the following logic should be
  334    # moved out of `manifest_download`:
 335    # if "SECRETS_MANAGER_SECRETS" in os.environ:
 336    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 337    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
 338    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
 339    def _purge_synapse_cache(
 340        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
 341    ) -> None:
 342        """
  343        Purge synapse cache if it exceeds a certain size. Defaults to 1 GB.
 344        Args:
 345            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
 346              before purging cache. Default is 1 GB.
 347            minute_buffer (int): All files created this amount of time or older will be deleted
 348        """
 349        # try clearing the cache
 350        # scan a directory and check size of files
 351        if os.path.exists(self.root_synapse_cache):
 352            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
 353                1024**3
 354            )
 355            nbytes = get_dir_size(self.root_synapse_cache)
 356            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
  357            # if the cache already exceeds the allowed size, purge files older than the minute buffer
 358            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
 359                num_of_deleted_files = clear_synapse_cache(
 360                    self.syn.cache, minutes=minute_buffer
 361                )
 362                logger.info(
 363                    f"{num_of_deleted_files}  files have been deleted from {self.root_synapse_cache}"
 364                )
 365            else:
  366                # on AWS, the OS takes around 14-17% of our ephemeral storage (20GiB)
  367                # instead of guessing how much space we have left, log the size of .synapseCache here
 368                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
 369
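    # Illustration for `_purge_synapse_cache` (default threshold): the cache is
    # purged only once it holds more than
    #
    #     1 * (1024 ** 3)  # == 1_073_741_824 bytes (1 GiB)
    #
    # and, when purging, only files older than `minute_buffer` minutes are
    # deleted.
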
 370    @tracer.start_as_current_span("SynapseStorage::query_fileview")
 371    def query_fileview(
 372        self,
 373        columns: Optional[list] = None,
 374        where_clauses: Optional[list] = None,
 375        force_requery: Optional[bool] = False,
 376    ) -> None:
 377        """
 378        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
  379        Called once during initialization of the SynapseStorage object; can be called again later to apply a more limited scope, e.g. for validation purposes.
 380        Args:
 381            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 382            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 383            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
 384        """
 385        self._purge_synapse_cache()
 386
  387        # Assume the new fileview query will differ from any previously stored query. Initialized to True because generally no query will have been performed yet.
 388        self.new_query_different = True
 389
 390        # If a query has already been performed, store the query
 391        previous_query_built = hasattr(self, "fileview_query")
 392        if previous_query_built:
 393            previous_query = self.fileview_query
 394
 395        # Build a query with the current given parameters and check to see if it is different from the previous
 396        self._build_query(columns=columns, where_clauses=where_clauses)
 397        if previous_query_built:
 398            self.new_query_different = self.fileview_query != previous_query
 399
 400        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
 401        if self.new_query_different or force_requery:
 402            try:
 403                self.storageFileviewTable = self.syn.tableQuery(
 404                    query=self.fileview_query,
 405                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
 406            except SynapseHTTPError as exc:
 407                exception_text = str(exc)
 408                if "Unknown column path" in exception_text:
 409                    raise ValueError(
 410                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation."
 411                    )
 412                elif "Unknown column" in exception_text:
 413                    missing_column = exception_text.split("Unknown column ")[-1]
 414                    raise ValueError(
 415                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
 416                    )
 417                else:
 418                    raise AccessCredentialsError(self.storageFileview)
 419
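    # Example for `query_fileview` (hypothetical usage sketch): re-scoping the
    # fileview query after initialization. The column names and clause are
    # illustrative and assume the fileview exposes "id", "path", and "type":
    #
    #     storage = SynapseStorage(perform_query=False)
    #     storage.query_fileview(columns=["id", "path"], where_clauses=["type='file'"])
    #     fileview_df = storage.getStorageFileviewTable()
    #
    # Repeating the same call is a no-op unless the built query changes or
    # `force_requery=True` is passed.
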
 420    @staticmethod
 421    def build_clause_from_dataset_id(
 422        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
 423    ) -> str:
 424        """
 425        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
 426        Args:
 427            dataset_id: Synapse ID of a dataset that should be used to limit the query
 428            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
 429        Returns:
 430            clause for the query or an empty string if no dataset ID is provided
 431        """
 432        # Calling this method without specifying synIDs will complete but will not scope the view
 433        if (not dataset_id) and (not dataset_folder_list):
 434            return ""
 435
 436        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
 437        if dataset_folder_list:
 438            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
 439            return f"parentId IN ({search_folders})"
 440
 441        # `dataset_id` should be provided when all files are stored directly under the dataset folder
 442        return f"parentId='{dataset_id}'"
 443
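    # Example outputs of `build_clause_from_dataset_id` (placeholder Synapse IDs):
    #
    #     SynapseStorage.build_clause_from_dataset_id(dataset_id="syn111")
    #     # -> "parentId='syn111'"
    #     SynapseStorage.build_clause_from_dataset_id(dataset_folder_list=["syn111", "syn222"])
    #     # -> "parentId IN ('syn111', 'syn222')"
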
 444    def _build_query(
 445        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
 446    ):
 447        """
 448        Method to build a query for Synapse FileViews
 449        Args:
 450            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 451            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 452            self.storageFileview (str): Synapse FileView ID
 453            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
  454                Gets added to where_clauses; included mostly for backwards compatibility and as a user-friendly way of subsetting the view in a simple way.
 455        """
 456        if columns is None:
 457            columns = []
 458        if where_clauses is None:
 459            where_clauses = []
 460
 461        if self.project_scope:
 462            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
 463            where_clauses.append(project_scope_clause)
 464
 465        if where_clauses:
 466            where_clauses = " AND ".join(where_clauses)
 467            where_clauses = f"WHERE {where_clauses} ;"
 468        else:
 469            where_clauses = ";"
 470
 471        if columns:
 472            columns = ",".join(columns)
 473        else:
 474            columns = "*"
 475
 476        self.fileview_query = (
 477            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
 478        )
 479
 480        return
 481
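    # Illustration for `_build_query` (placeholder IDs): with
    # storageFileview == "syn999", columns == ["id", "path"], and
    # project_scope == ["syn111"], the method builds:
    #
    #     SELECT id,path FROM syn999 WHERE projectId IN ('syn111', '') ;
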
 482    @staticmethod
 483    @tracer.start_as_current_span("SynapseStorage::login")
 484    def login(
 485        synapse_cache_path: Optional[str] = None,
 486        access_token: Optional[str] = None,
 487    ) -> synapseclient.Synapse:
 488        """Login to Synapse
 489
 490        Args:
 491            access_token (Optional[str], optional): A synapse access token. Defaults to None.
 492            synapse_cache_path (Optional[str]): location of synapse cache
 493
 494        Raises:
  495            ValueError: If unable to log in with the access token
 496
 497        Returns:
 498            synapseclient.Synapse: A Synapse object that is logged in
 499        """
 500        if not access_token:
 501            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
 502
 503        # login using a token
 504        if access_token:
 505            try:
 506                syn = synapseclient.Synapse(
 507                    cache_root_dir=synapse_cache_path,
 508                    debug=False,
 509                    skip_checks=True,
 510                    cache_client=False,
 511                )
 512                syn.login(authToken=access_token, silent=True)
 513            except SynapseHTTPError as exc:
 514                raise ValueError(
 515                    "No access to resources. Please make sure that your token is correct"
 516                ) from exc
 517        else:
 518            # login using synapse credentials provided by user in .synapseConfig (default) file
 519            syn = synapseclient.Synapse(
 520                configPath=CONFIG.synapse_configuration_path,
 521                cache_root_dir=synapse_cache_path,
 522                debug=False,
 523                skip_checks=True,
 524                cache_client=False,
 525            )
 526            syn.login(silent=True)
 527
 528        # set user id attribute
 529        current_span = trace.get_current_span()
 530        if current_span.is_recording():
 531            current_span.set_attribute("user.id", syn.credentials.owner_id)
 532
 533        return syn
 534
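    # Example for `login` (hypothetical usage sketch):
    #
    #     syn = SynapseStorage.login(access_token=os.getenv("SYNAPSE_ACCESS_TOKEN"))
    #
    # If no token is supplied or found in the environment, credentials are read
    # from the .synapseConfig file instead.
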
 535    def missing_entity_handler(method):
 536        def wrapper(*args, **kwargs):
 537            try:
 538                return method(*args, **kwargs)
 539            except SynapseHTTPError as ex:
 540                str_message = str(ex).replace("\n", "")
 541                if "trash" in str_message or "does not exist" in str_message:
 542                    logging.warning(str_message)
 543                    return None
 544                else:
 545                    raise ex
 546
 547        return wrapper
 548
 549    def async_missing_entity_handler(method):
 550        """Decorator to handle missing entities in async methods."""
 551
 552        async def wrapper(*args: Any, **kwargs: Any) -> Any:
 553            try:
 554                return await method(*args, **kwargs)
 555            except SynapseHTTPError as ex:
 556                str_message = str(ex).replace("\n", "")
 557                if "trash" in str_message or "does not exist" in str_message:
 558                    logging.warning(str_message)
 559                    return None
 560                else:
 561                    raise ex
 562
 563        return wrapper
 564
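    # Example for `missing_entity_handler` / `async_missing_entity_handler`
    # (hypothetical usage sketch): either decorator can wrap a method so that
    # calls against trashed or deleted entities return None instead of raising.
    # `get_entity_metadata` is an illustrative name, not a real method:
    #
    #     @missing_entity_handler
    #     def get_entity_metadata(self, synapse_id: str):
    #         return self.syn.get(synapse_id, downloadFile=False)
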
 565    def getStorageFileviewTable(self):
 566        """Returns the storageFileviewTable obtained during initialization."""
 567        return self.storageFileviewTable
 568
 569    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
 570        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
 571
 572        Args:
 573            currentUserId: synapse id for the user whose projects we want to get.
 574
 575        Returns:
 576            A dictionary with a next page token and the results.
 577        """
 578        all_results = self.syn.restGET(
 579            "/projects/user/{principalId}".format(principalId=currentUserId)
 580        )
 581
 582        while (
 583            "nextPageToken" in all_results
  584        ):  # iterate while the results still contain a next page token
 585            results_token = self.syn.restGET(
 586                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
 587                    principalId=currentUserId,
 588                    nextPageToken=all_results["nextPageToken"],
 589                )
 590            )
 591            all_results["results"].extend(results_token["results"])
 592
 593            if "nextPageToken" in results_token:
 594                all_results["nextPageToken"] = results_token["nextPageToken"]
 595            else:
 596                del all_results["nextPageToken"]
 597
 598        return all_results
 599
 600    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
  601    def getStorageProjects(self, project_scope: Optional[List] = None) -> list[tuple[str, str]]:
 602        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
 603
 604        Returns:
 605            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
 606        """
 607
 608        # get the set of all storage Synapse project accessible for this pipeline
 609        storageProjects = self.storageFileviewTable["projectId"].unique()
 610
 611        # get the set of storage Synapse project accessible for this user
 612        # get a list of projects from Synapse
 613        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
 614            current_user_id=self.syn.credentials.owner_id, syn=self.syn
 615        )
 616        project_id_to_name_dict = {}
 617        current_user_projects = []
 618        for project_header in current_user_project_headers:
 619            project_id_to_name_dict[project_header.get("id")] = project_header.get(
 620                "name"
 621            )
 622            current_user_projects.append(project_header.get("id"))
 623
 624        # find set of user projects that are also in this pipeline's storage projects set
 625        storageProjects = list(set(storageProjects) & set(current_user_projects))
 626
 627        # Limit projects to scope if specified
 628        if project_scope:
 629            storageProjects = list(set(storageProjects) & set(project_scope))
 630
 631            if not storageProjects:
 632                raise Warning(
 633                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
 634                )
 635
 636        # prepare a return list of project IDs and names
 637        projects = []
 638        for projectId in storageProjects:
 639            project_name_from_project_header = project_id_to_name_dict.get(projectId)
 640            projects.append((projectId, project_name_from_project_header))
 641
 642        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
 643
 644        return sorted_projects_list
 645
 646    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
 647    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
 648        """Gets all datasets in folder under a given storage project that the current user has access to.
 649
 650        Args:
 651            projectId: synapse ID of a storage project.
 652
 653        Returns:
 654            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
 655            None: If the projectId cannot be found on Synapse.
 656        """
 657
 658        # select all folders and fetch their names from within the storage project;
 659        # if folder content type is defined, only select folders that contain datasets
 660        if "contentType" in self.storageFileviewTable.columns:
 661            foldersTable = self.storageFileviewTable[
 662                (self.storageFileviewTable["contentType"] == "dataset")
 663                & (self.storageFileviewTable["projectId"] == projectId)
 664            ]
 665        else:
 666            foldersTable = self.storageFileviewTable[
 667                (self.storageFileviewTable["type"] == "folder")
 668                & (self.storageFileviewTable["parentId"] == projectId)
 669            ]
 670
  671        # get an array of tuples (folderId, folderName)
  672        # some folders are part of datasets; others contain datasets
  673        # each dataset's parent is the project; folders that are part of a dataset have another folder as a parent
  674        # to select only folders that contain datasets, check for each folder
  675        # whether its parent is the project; if so, that folder contains a dataset,
  676        # unless the folder list has already been filtered to dataset folders based on the contentType attribute above
 677
 678        datasetList = []
 679        folderProperties = ["id", "name"]
 680        for folder in list(
 681            foldersTable[folderProperties].itertuples(index=False, name=None)
 682        ):
 683            datasetList.append(folder)
 684
 685        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
 686
 687        return sorted_dataset_list
 688
 689    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
 690    def getFilesInStorageDataset(
  691        self, datasetId: str, fileNames: Optional[List] = None, fullpath: bool = True
 692    ) -> List[Tuple[str, str]]:
 693        """Gets all files (excluding manifest files) in a given dataset folder.
 694
 695        Args:
 696            datasetId: synapse ID of a storage dataset.
 697            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
 698            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
  699            fullpath: if True, return the full path as part of the filename; otherwise return just the base filename
 700
 701        Returns:
 702            A list of files; the list consists of tuples (fileId, fileName).
 703
 704        Raises:
  705            ValueError: Fileview is empty.
                 LookupError: Dataset ID not found in the fileview.
 706        """
 707        file_list = []
 708
  709        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
 710        if self.storageFileviewTable.empty:
 711            raise ValueError(
 712                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
 713            )
 714
 715        child_path = self.storageFileviewTable.loc[
 716            self.storageFileviewTable["parentId"] == datasetId, "path"
 717        ]
 718        if child_path.empty:
 719            raise LookupError(
 720                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
 721            )
 722        child_path = child_path.iloc[0]
 723
 724        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
 725        parent = child_path.split("/")[:-1]
 726        parent = "/".join(parent)
 727
 728        # Format dataset path to be used in table query
 729        dataset_path = f"'{parent}/%'"
 730
  731        # When querying, restrict to file entities so that folders and subdirectories are excluded
 732        where_clauses = [f"path like {dataset_path}", "type='file'"]
 733
 734        # Requery the fileview to specifically get the files in the given dataset
 735        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
 736
 737        # Exclude manifest files
 738        non_manifest_files = self.storageFileviewTable.loc[
 739            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
 740            :,
 741        ]
 742
 743        # Remove all files that are not in the list of fileNames
 744        if fileNames:
 745            filename_regex = "|".join(fileNames)
 746
 747            matching_files = non_manifest_files["path"].str.contains(
 748                filename_regex, case=False, regex=True
 749            )
 750
 751            non_manifest_files = non_manifest_files.loc[matching_files, :]
 752
 753        # Truncate path if necessary
 754        if not fullpath:
 755            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
 756
 757        # Return list of files as expected by other methods
 758        file_list = list(non_manifest_files.itertuples(index=False, name=None))
 759
 760        return file_list
 761
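    # Example for `getFilesInStorageDataset` (hypothetical usage sketch;
    # "syn456" is a placeholder dataset ID):
    #
    #     files = storage.getFilesInStorageDataset("syn456", fullpath=False)
    #     # -> e.g. [("syn789", "sample1.fastq"), ("syn790", "sample2.fastq")]
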
 762    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
 763        """If both censored and uncensored manifests are present, return uncensored manifest; if only one manifest is present, return manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one.
 764        Args:
 765        manifest: a dataframe contains name and id of manifests in a given asset view
 766
 767        Return:
 768        manifest_syn_id: id of a given censored or uncensored manifest
 769        """
 770        censored_regex = re.compile(".*censored.*")
 771        censored = manifest["name"].str.contains(censored_regex)
 772        if any(censored):
 773            # Try to use uncensored manifest first
 774            not_censored = ~censored
 775            if any(not_censored):
 776                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
 777            # if only censored manifests are available, just use the first censored manifest
 778            else:
 779                manifest_syn_id = manifest["id"].iloc[0]
 780
 781        # otherwise, use the first (implied only) version that exists
 782        else:
 783            manifest_syn_id = manifest["id"].iloc[0]
 784
 785        return manifest_syn_id
 786
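    # Illustration for `_get_manifest_id` (hypothetical data): given a manifest
    # dataframe such as
    #
    #     name                                   id
    #     synapse_storage_manifest.csv           syn001
    #     synapse_storage_manifest_censored.csv  syn002
    #
    # the method returns "syn001", preferring the uncensored copy.
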
 787    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
 788    def getDatasetManifest(
 789        self,
 790        datasetId: str,
 791        downloadFile: bool = False,
 792        newManifestName: str = "",
 793        use_temporary_folder: bool = True,
 794    ) -> Union[str, File]:
 795        """Gets the manifest associated with a given dataset.
 796
 797        Args:
 798            datasetId: synapse ID of a storage dataset.
 799            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
 800            newManifestName: new name of a manifest that gets downloaded
 801            use_temporary_folder: boolean argument indicating if a temporary folder
 802                should be used to store the manifest file. This is useful when running
 803                this code as an API server where multiple requests could be made at the
 804                same time. This is set to False when the code is being used from the
 805                CLI. Defaults to True.
 806
 807        Returns:
  808            manifest_syn_id (String): Synapse ID of existing manifest file.
  809            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
  810            "" (String): No pre-existing manifest in dataset.
 811        """
 812        manifest_data = ""
 813
 814        # get a list of files containing the manifest for this dataset (if any)
 815        all_files = self.storageFileviewTable
 816
 817        # construct regex based on manifest basename in the config
  818        manifest_re = re.compile(os.path.basename(self.manifest) + r".*\.[tc]sv")
 819
 820        # search manifest based on given manifest basename regex above
 821        # and return a dataframe containing name and id of manifests in a given asset view
 822        manifest = all_files[
 823            (all_files["name"].str.contains(manifest_re, regex=True))
 824            & (all_files["parentId"] == datasetId)
 825        ]
 826
 827        manifest = manifest[["id", "name"]]
 828
  829        # if there is no pre-existing manifest in the specified dataset
 830        if manifest.empty:
 831            logger.warning(
 832                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
 833            )
 834            return ""
 835
  836        # if there is an existing manifest
 837        else:
 838            manifest_syn_id = self._get_manifest_id(manifest)
 839            if downloadFile:
 840                md = ManifestDownload(
 841                    self.syn,
 842                    manifest_id=manifest_syn_id,
 843                    synapse_entity_tracker=self.synapse_entity_tracker,
 844                )
 845                manifest_data = md.download_manifest(
 846                    newManifestName=newManifestName,
 847                    manifest_df=manifest,
 848                    use_temporary_folder=use_temporary_folder,
 849                )
  850                # TODO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
 851                # then we should catch the error here without returning an empty string.
 852                if not manifest_data:
 853                    logger.debug(
 854                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
 855                    )
 856                return manifest_data
 857            return manifest_syn_id
 858
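    # Example for `getDatasetManifest` (hypothetical usage sketch; "syn456" is
    # a placeholder dataset ID):
    #
    #     manifest_id = storage.getDatasetManifest("syn456")
    #     manifest_file = storage.getDatasetManifest("syn456", downloadFile=True)
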
 859    def getDataTypeFromManifest(self, manifestId: str):
 860        """Fetch a manifest and return data types of all columns
 861        Args:
 862            manifestId: synapse ID of a manifest
 863        """
 864        # get manifest file path
 865        manifest_entity = self.synapse_entity_tracker.get(
 866            synapse_id=manifestId, syn=self.syn, download_file=True
 867        )
 868        manifest_filepath = manifest_entity.path
 869
 870        # load manifest dataframe
 871        manifest = load_df(
 872            manifest_filepath,
 873            preserve_raw_input=False,
 874            data_model=False,
 875        )
 876
 877        # convert the dataFrame to use best possible dtypes.
 878        manifest_new = manifest.convert_dtypes()
 879
 880        # get data types of columns
 881        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
 882
 883        # return the result as a dictionary
 884        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
 885
 886        return result_dict
 887
 888    def _get_files_metadata_from_dataset(
  889        self, datasetId: str, only_new_files: bool, manifest: Optional[pd.DataFrame] = None
 890    ) -> Optional[dict]:
 891        """retrieve file ids under a particular datasetId
 892
 893        Args:
 894            datasetId (str): a dataset id
  895            only_new_files (bool): whether to only add new files that do not already exist
  896            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.
 897
 898        Returns:
  899            a dictionary containing the filenames and entity IDs under the given datasetId, or None if there are no files under the given dataset id
 900        """
 901        dataset_files = self.getFilesInStorageDataset(datasetId)
 902        if dataset_files:
 903            dataset_file_names_id_dict = self._get_file_entityIds(
 904                dataset_files, only_new_files=only_new_files, manifest=manifest
 905            )
 906            return dataset_file_names_id_dict
 907        else:
 908            return None
 909
 910    def add_entity_id_and_filename(
 911        self, datasetId: str, manifest: pd.DataFrame
 912    ) -> pd.DataFrame:
 913        """add entityid and filename column to an existing manifest assuming entityId column is not already present
 914
 915        Args:
 916            datasetId (str): dataset syn id
 917            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
 918
 919        Returns:
 920            pd.DataFrame: returns a pandas dataframe
 921        """
 922        # get file names and entity ids of a given dataset
 923        dataset_files_dict = self._get_files_metadata_from_dataset(
 924            datasetId, only_new_files=False
 925        )
 926
 927        if dataset_files_dict:
 928            # turn manifest dataframe back to a dictionary for operation
 929            manifest_dict = manifest.to_dict("list")
 930
 931            # update Filename column
 932            # add entityId column to the end
 933            manifest_dict.update(dataset_files_dict)
 934
 935            # if the component column exists in existing manifest, fill up that column
 936            if "Component" in manifest_dict.keys():
 937                manifest_dict["Component"] = manifest_dict["Component"] * max(
 938                    1, len(manifest_dict["Filename"])
 939                )
 940
 941            # turn dictionary back to a dataframe
 942            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
 943            manifest_df_updated = manifest_df_index.transpose()
 944
 945            # fill na with empty string
 946            manifest_df_updated = manifest_df_updated.fillna("")
 947
 948            # drop index
 949            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
 950
 951            return manifest_df_updated
 952        else:
 953            return manifest
 954
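    # Illustration for `add_entity_id_and_filename` (hypothetical data): a
    # manifest with an empty Filename column and a single Component value,
    #
    #     manifest = pd.DataFrame({"Filename": [""], "Component": ["Biospecimen"]})
    #     updated = storage.add_entity_id_and_filename("syn456", manifest)
    #
    # comes back with one row per file in the dataset, entityId filled in, and
    # "Biospecimen" repeated down the Component column.
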
 955    def fill_in_entity_id_filename(
 956        self, datasetId: str, manifest: pd.DataFrame
 957    ) -> Tuple[List, pd.DataFrame]:
 958        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 959
 960        Args:
 961            datasetId (str): dataset syn id
 962            manifest (pd.DataFrame): existing manifest dataframe.
 963
 964        Returns:
 965            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 966        """
 967        # get dataset file names and entity id as a list of tuple
 968        dataset_files = self.getFilesInStorageDataset(datasetId)
 969
 970        # update manifest with additional filenames, if any
 971        # note that if there is an existing manifest and there are files in the dataset
 972        # the columns Filename and entityId are assumed to be present in manifest schema
 973        # TODO: use idiomatic panda syntax
 974        if not dataset_files:
 975            manifest = manifest.fillna("")
 976            return dataset_files, manifest
 977
 978        all_files = self._get_file_entityIds(
 979            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 980        )
 981        new_files = self._get_file_entityIds(
 982            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 983        )
 984
 985        all_files = pd.DataFrame(all_files)
 986        new_files = pd.DataFrame(new_files)
 987
 988        # update manifest so that it contains new dataset files
 989        manifest = (
 990            pd.concat([manifest, new_files], sort=False)
 991            .reset_index()
 992            .drop("index", axis=1)
 993        )
 994
 995        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 996        manifest_reindex = manifest.set_index("entityId")
 997        all_files_reindex = all_files.set_index("entityId")
 998        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
 999            manifest_reindex
1000        )
1001
1002        # Check if individual file paths in manifest and from synapse match
1003        file_paths_match = (
1004            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1005        )
1006
 1007        # If any of the file paths do not match, update the manifest with the file paths from Synapse
1008        if not file_paths_match.all():
1009            manifest_reindex.loc[
1010                ~file_paths_match, "Filename"
1011            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1012
1013            # reformat manifest for further use
1014            manifest = manifest_reindex.reset_index()
1015            entityIdCol = manifest.pop("entityId")
1016            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1017
1018        manifest = manifest.fillna("")
1019        return dataset_files, manifest
1020
1021    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1022    def updateDatasetManifestFiles(
1023        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1024    ) -> Union[Tuple[str, pd.DataFrame], None]:
1025        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1026
1027        Args:
1028            dmge: DataModelGraphExplorer Instance
1029            datasetId: synapse ID of a storage dataset.
1030            store: if set to True store updated manifest in asset store; if set to False
1031            return a Pandas dataframe containing updated manifest but do not store to asset store
1032
1033
1034        Returns:
1035            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1036            If there is no existing manifest or if the manifest does not have an entityId column, return None
1037        """
1038
1039        # get existing manifest Synapse ID
1040        manifest_id = self.getDatasetManifest(datasetId)
1041
1042        # if there is no manifest return None
1043        if not manifest_id:
1044            return None
1045
1046        manifest_entity = self.synapse_entity_tracker.get(
1047            synapse_id=manifest_id, syn=self.syn, download_file=True
1048        )
1049        manifest_filepath = manifest_entity.path
1050        manifest = load_df(manifest_filepath)
1051
1052        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1053        if "entityId" not in manifest.columns:
1054            return None
1055
1056        manifest_is_file_based = "Filename" in manifest.columns
1057
1058        if manifest_is_file_based:
1059            # update manifest with additional filenames, if any
1060            # note that if there is an existing manifest and there are files in the dataset
1061            # the columns Filename and entityId are assumed to be present in manifest schema
1062            # TODO: use idiomatic panda syntax
1063            dataset_files, manifest = self.fill_in_entity_id_filename(
1064                datasetId, manifest
1065            )
1066            if dataset_files:
1067                # update the manifest file, so that it contains the relevant entity IDs
1068                if store:
1069                    manifest.to_csv(manifest_filepath, index=False)
1070
1071                    # store manifest and update associated metadata with manifest on Synapse
1072                    manifest_id = self.associateMetadataWithFiles(
1073                        dmge, manifest_filepath, datasetId
1074                    )
1075
1076        return manifest_id, manifest
1077
1078    def _get_file_entityIds(
1079        self,
1080        dataset_files: List,
1081        only_new_files: bool = False,
 1082        manifest: Optional[pd.DataFrame] = None,
1083    ):
1084        """
1085        Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
1086
1087        Args:
1088            manifest: metadata manifest
 1089            dataset_files: List of all files in a dataset
1090            only_new_files: boolean to control whether only new files are returned or all files in the dataset
1091        Returns:
1092            files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
1093        """
1094        files = {"Filename": [], "entityId": []}
1095
1096        if only_new_files:
1097            if manifest is None:
1098                raise UnboundLocalError(
1099                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1100                )
1101
1102            if "entityId" not in manifest.columns:
1103                raise ValueError(
1104                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1105                    "Please generate an empty manifest without annotations, manually add annotations to the "
1106                    "appropriate files in the manifest, and then try again."
1107                )
1108
1109            # find new files (that are not in the current manifest) if any
1110            for file_id, file_name in dataset_files:
 1111                if file_id not in manifest["entityId"].values:
1112                    files["Filename"].append(file_name)
1113                    files["entityId"].append(file_id)
1114        else:
1115            # get all files
1116            for file_id, file_name in dataset_files:
1117                files["Filename"].append(file_name)
1118                files["entityId"].append(file_id)
1119
1120        return files
1121
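    # Illustration for `_get_file_entityIds` (hypothetical data): with
    # dataset_files == [("syn1", "a.txt"), ("syn2", "b.txt")] and a manifest
    # whose entityId column already contains "syn1", calling with
    # only_new_files=True yields:
    #
    #     {"Filename": ["b.txt"], "entityId": ["syn2"]}
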
1122    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1123    def getProjectManifests(
1124        self, projectId: str
1125    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1126        """Gets all metadata manifest files across all datasets in a specified project.
1127
1128        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1129                 as a list of tuples, one for each manifest:
1130                    [
1131                        (
1132                            (datasetId, dataName),
1133                            (manifestId, manifestName),
1134                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1135                        ),
1136                        ...
1137                    ]
1138
1139        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1140        """
1141        component = None
1142        entity = None
1143        manifests = []
1144
1145        datasets = self.getStorageDatasetsInProject(projectId)
1146
1147        for datasetId, datasetName in datasets:
1148            # encode information about the manifest in a simple list (so that R clients can unpack it)
1149            # eventually can serialize differently
1150
1151            # Get synID of manifest for a dataset
1152            manifestId = self.getDatasetManifest(datasetId)
1153
1154            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1155            if manifestId:
1156                annotations = self.getFileAnnotations(manifestId)
1157
1158                # If manifest has annotations specifying component, use that
1159                if annotations and "Component" in annotations:
1160                    component = annotations["Component"]
1161                    entity = self.synapse_entity_tracker.get(
1162                        synapse_id=manifestId, syn=self.syn, download_file=False
1163                    )
1164                    manifest_name = entity["properties"]["name"]
1165
1166                # otherwise download the manifest and parse for information
1167                elif not annotations or "Component" not in annotations:
1168                    logging.debug(
1169                        f"No component annotations have been found for manifest {manifestId}. "
1170                        "The manifest will be downloaded and parsed instead. "
1171                        "For increased speed, add component annotations to manifest."
1172                    )
1173
1174                    manifest_info = self.getDatasetManifest(
1175                        datasetId, downloadFile=True
1176                    )
1177                    manifest_name = manifest_info["properties"].get("name", "")
1178
1179                    if not manifest_name:
1180                        logger.error(f"Failed to download manifests from {datasetId}")
1181
1182                    manifest_path = manifest_info["path"]
1183
1184                    manifest_df = load_df(manifest_path)
1185
1186                    # Get component from component column if it exists
1187                    if (
1188                        "Component" in manifest_df
1189                        and not manifest_df["Component"].empty
1190                    ):
 1191                        # collapse to the unique component values
 1192                        component = list(set(manifest_df["Component"]))
1193
1194                        # Added to address issues raised during DCA testing
1195                        if "" in component:
1196                            component.remove("")
1197
1198                        if len(component) == 1:
1199                            component = component[0]
1200                        elif len(component) > 1:
1201                            logger.warning(
1202                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1203                                "Behavior of manifests with multiple components is undefined."
1204                            )
1205            else:
1206                manifest_name = ""
1207                component = None
1208            if component:
1209                manifest = (
1210                    (datasetId, datasetName),
1211                    (manifestId, manifest_name),
1212                    (component, component),
1213                )
1214            elif manifestId:
1215                logger.debug(
1216                    f"Manifest {manifestId} does not have an associated Component"
1217                )
1218                manifest = (
1219                    (datasetId, datasetName),
1220                    (manifestId, manifest_name),
1221                    ("", ""),
1222                )
1223            else:
1224                manifest = (
1225                    (datasetId, datasetName),
1226                    ("", ""),
1227                    ("", ""),
1228                )
1229
1230            if manifest:
1231                manifests.append(manifest)
1232
1233        return manifests
1234
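        # Usage sketch (hypothetical, not part of the original module): consuming the
        # (dataset, manifest, component) triples built above. Assumes `store` is an
        # initialized SynapseStorage instance, that the enclosing method is exposed as
        # `getProjectManifests`, and that "syn123" is a placeholder project ID.
        #
        #   for (dataset_id, _), (manifest_id, _), (component, _) in store.getProjectManifests(
        #       projectId="syn123"
        #   ):
        #       print(dataset_id, manifest_id or "<no manifest>", component or "<no component>")
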
1235    def upload_project_manifests_to_synapse(
1236        self, dmge: DataModelGraphExplorer, projectId: str
1237    ) -> List[str]:
1238        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1239
1240        Returns: List of the names of the datasets whose manifests were uploaded as tables.
1241        """
1242
1243        manifests = []
1244        manifest_loaded = []
1245        datasets = self.getStorageDatasetsInProject(projectId)
1246
1247        for datasetId, datasetName in datasets:
1248            # encode information about the manifest in a simple list (so that R clients can unpack it)
1249            # eventually can serialize differently
1250
1251            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1252
1253            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1254            if manifest_info:
1255                manifest_id = manifest_info["properties"]["id"]
1256                manifest_name = manifest_info["properties"]["name"]
1257                manifest_path = manifest_info["path"]
1258                manifest_df = load_df(manifest_path)
1259                manifest_table_id, _, _ = self.uploadDB(
1260                    dmge=dmge,
1261                    manifest=manifest_df,
1262                    datasetId=datasetId,
1263                    table_name=datasetName,
1264                )
1265                manifest_loaded.append(datasetName)
1266        return manifest_loaded
1267
1268    def upload_annotated_project_manifests_to_synapse(
1269        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1270    ) -> Tuple[List[tuple], List[tuple]]:
1271        """
1272        Purpose:
1273            For all manifests in a project, upload each as a table and set annotations on the manifest csv.
1274            Assumes the manifest is already present as a CSV in a dataset in the project.
1275        Returns: Tuple of (all manifest tuples found, manifest tuples successfully loaded).
1276        """
1277        # DataModelParser and DataModelGraph are not imported at this module's top
1278        # level, so import them locally here
1279        from schematic.schemas.data_model_parser import DataModelParser
1280        from schematic.schemas.data_model_graph import DataModelGraph
1281
1282        # Instantiate DataModelParser and parse the model
1283        parsed_data_model = DataModelParser(path_to_data_model=path_to_json_ld).parse_model()
1284
1285        # Instantiate DataModelGraph and generate the graph
1286        data_model_grapher = DataModelGraph(parsed_data_model)
1287        graph_data_model = data_model_grapher.generate_data_model_graph()
1288
1289        # Instantiate DataModelGraphExplorer
1290
1291        manifests = []
1292        manifest_loaded = []
1293        datasets = self.getStorageDatasetsInProject(projectId)
1294        for datasetId, datasetName in datasets:
1295            # encode information about the manifest in a simple list (so that R clients can unpack it)
1296            # eventually can serialize differently
1297
1298            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1299            manifests.append(manifest)
1300
1301            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1302
1303            if manifest_info:
1304                manifest_id = manifest_info["properties"]["id"]
1305                manifest_name = manifest_info["properties"]["name"]
1306                manifest_path = manifest_info["path"]
1307                manifest = (
1308                    (datasetId, datasetName),
1309                    (manifest_id, manifest_name),
1310                    ("", ""),
1311                )
1312                if not dry_run:
1313                    self.associateMetadataWithFiles(
1314                        dmge, manifest_path, datasetId, manifest_record_type="table"
1315                    )
1316                manifest_loaded.append(manifest)
1317
1318        return manifests, manifest_loaded
1319
1320    def move_entities_to_new_project(
1321        self,
1322        projectId: str,
1323        newProjectId: str,
1324        returnEntities: bool = False,
1325        dry_run: bool = False,
1326    ):
1327        """
1328        For each manifest csv in a project, look up all the entity ids associated with it.
1329        Look up each entity in the files and move it to the new project.
1330        """
1331
1332        manifests = []
1333        manifest_loaded = []
1334        datasets = self.getStorageDatasetsInProject(projectId)
1335        if datasets:
1336            for datasetId, datasetName in datasets:
1337                # encode information about the manifest in a simple list (so that R clients can unpack it)
1338                # eventually can serialize differently
1339
1340                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1341                manifests.append(manifest)
1342
1343                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1344                if manifest_info:
1345                    manifest_id = manifest_info["properties"]["id"]
1346                    manifest_name = manifest_info["properties"]["name"]
1347                    manifest_path = manifest_info["path"]
1348                    manifest_df = load_df(manifest_path)
1349
1350                    manifest = (
1351                        (datasetId, datasetName),
1352                        (manifest_id, manifest_name),
1353                        ("", ""),
1354                    )
1355                    manifest_loaded.append(manifest)
1356
1357                    annotation_entities = self.storageFileviewTable[
1358                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1359                        & (self.storageFileviewTable["type"] == "folder")
1360                    ]["id"]
1361
1362                    if returnEntities:
1363                        for entityId in annotation_entities:
1364                            if not dry_run:
1365                                moved_entity = self.syn.move(entityId, datasetId)
1366                                self.synapse_entity_tracker.add(
1367                                    synapse_id=moved_entity.id, entity=moved_entity
1368                                )
1369                            else:
1370                                logger.info(
1371                                    f"{entityId} will be moved to folder {datasetId}."
1372                                )
1373                    else:
1374                        # generate project folder
1375                        archive_project_folder = Folder(
1376                            projectId + "_archive", parent=newProjectId
1377                        )
1378                        archive_project_folder = self.syn.store(archive_project_folder)
1379                        self.synapse_entity_tracker.add(
1380                            synapse_id=archive_project_folder.id,
1381                            entity=archive_project_folder,
1382                        )
1383
1384                        # generate dataset folder
1385                        dataset_archive_folder = Folder(
1386                            "_".join([datasetId, datasetName, "archive"]),
1387                            parent=archive_project_folder.id,
1388                        )
1389                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1390                        self.synapse_entity_tracker.add(
1391                            synapse_id=dataset_archive_folder.id,
1392                            entity=dataset_archive_folder,
1393                        )
1394
1395                        for entityId in annotation_entities:
1396                            # move entities to folder
1397                            if not dry_run:
1398                                moved_entity = self.syn.move(
1399                                    entityId, dataset_archive_folder.id
1400                                )
1401                                self.synapse_entity_tracker.add(
1402                                    synapse_id=moved_entity.id, entity=moved_entity
1403                                )
1404                            else:
1405                                logger.info(
1406                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1407                                )
1408        else:
1409            raise LookupError(
1410                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1411            )
1412        return manifests, manifest_loaded
1413
1414    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1415    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1416        """Download synapse table as a pd dataframe; return table schema and etags as results too
1417
1418        Args:
1419            synapse_id: synapse ID of the table to query
1420        """
1421
1422        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1423        df = results.asDataFrame(
1424            rowIdAndVersionInIndex=False,
1425            na_values=STR_NA_VALUES_FILTERED,
1426            keep_default_na=False,
1427        )
1428
1429        return df, results
1430
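        # Usage sketch (illustrative only, assuming an initialized SynapseStorage
        # instance `store` and a placeholder table ID "syn999"):
        #
        #   df, results = store.get_synapse_table("syn999")
        #   print(df.shape)          # full table contents as a DataFrame
        #   print(results.tableId)   # query results retain schema/etag information
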
1431    @missing_entity_handler
1432    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1433    def uploadDB(
1434        self,
1435        dmge: DataModelGraphExplorer,
1436        manifest: pd.DataFrame,
1437        datasetId: str,
1438        table_name: str,
1439        restrict: bool = False,
1440        table_manipulation: str = "replace",
1441        table_column_names: str = "class_label",
1442    ):
1443        """
1444        Method to upload a database to an asset store. In synapse, this will upload a metadata table
1445
1446        Args:
1447            dmge: DataModelGraphExplorer object
1448            manifest: pd.Df manifest to upload
1449            datasetId: synID of the dataset for the manifest
1450            table_name: name of the table to be uploaded
1451            restrict: bool, whether or not the manifest contains sensitive data that will need
1452                additional access restrictions
1453            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1454            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1455                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1456                display label formatting.
1457        Returns:
1458            manifest_table_id: synID of the uploaded table
1459            manifest: the original manifest
1460            table_manifest: manifest formatted appropriately for the table
1461
1462        """
1463
1464        col_schema, table_manifest = self.formatDB(
1465            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1466        )
1467
1468        manifest_table_id = self.buildDB(
1469            datasetId,
1470            table_name,
1471            col_schema,
1472            table_manifest,
1473            table_manipulation,
1474            dmge,
1475            restrict,
1476        )
1477
1478        return manifest_table_id, manifest, table_manifest
1479
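        # Usage sketch (illustrative): uploadDB returns a triple, so callers are
        # expected to unpack it. `dmge`, `manifest_df`, and "syn456" are placeholders.
        #
        #   table_id, manifest_df, table_manifest = store.uploadDB(
        #       dmge=dmge,
        #       manifest=manifest_df,
        #       datasetId="syn456",
        #       table_name="patient_synapse_storage_manifest_table",
        #       table_manipulation="replace",
        #   )
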
1480    @tracer.start_as_current_span("SynapseStorage::formatDB")
1481    def formatDB(self, dmge, manifest, table_column_names):
1482        """
1483        Method to format a manifest appropriately for upload as a table
1484
1485        Args:
1486            dmge: DataModelGraphExplorer object
1487            manifest: pd.Df manifest to upload
1488            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1489                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1490                display label formatting.
1491        Returns:
1492            col_schema: schema for table columns: type, size, etc
1493            table_manifest: formatted manifest
1494
1495        """
1496        # Rename the manifest columns to display names to match fileview
1497
1498        blacklist_chars = ["(", ")", ".", " ", "-"]
1499        manifest_columns = manifest.columns.tolist()
1500
1501        table_manifest = deepcopy(manifest)
1502
1503        if table_column_names == "display_name":
1504            cols = table_manifest.columns
1505
1506        elif table_column_names == "display_label":
1507            cols = [
1508                str(col).translate({ord(x): "" for x in blacklist_chars})
1509                for col in manifest_columns
1510            ]
1511
1512        elif table_column_names == "class_label":
1513            cols = [
1514                get_class_label_from_display_name(str(col)).translate(
1515                    {ord(x): "" for x in blacklist_chars}
1516                )
1517                for col in manifest_columns
1518            ]
1519        else:
1520            raise ValueError(
1521                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1522            )
1523
1524        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1525
1526        # Reset column names in table manifest
1527        table_manifest.columns = cols
1528
1529        # move entity id to end of df
1530        entity_col = table_manifest.pop("entityId")
1531        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1532
1533        # Get the column schema
1534        col_schema = as_table_columns(table_manifest)
1535
1536        # Set Id column length to 64 (for some reason it is not being auto-set).
1537        for i, col in enumerate(col_schema):
1538            if col["name"].lower() == "id":
1539                col_schema[i]["maximumSize"] = 64
1540
1541        return col_schema, table_manifest
1542
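        # Worked example of the column renaming performed by formatDB (values are
        # illustrative): stripping the blacklisted characters from a display name.
        #
        #   blacklist_chars = ["(", ")", ".", " ", "-"]
        #   "Family History (Cancer)".translate({ord(x): "" for x in blacklist_chars})
        #   # -> 'FamilyHistoryCancer'
        #
        # With table_column_names="class_label", the display name is first converted
        # to upper camelcase by get_class_label_from_display_name before stripping.
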
1543    @tracer.start_as_current_span("SynapseStorage::buildDB")
1544    def buildDB(
1545        self,
1546        datasetId: str,
1547        table_name: str,
1548        col_schema: List,
1549        table_manifest: pd.DataFrame,
1550        table_manipulation: str,
1551        dmge: DataModelGraphExplorer,
1552        restrict: bool = False,
1553    ):
1554        """
1555        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1556        Calls TableOperations class to execute
1557
1558        Args:
1559            datasetId: synID of the dataset for the manifest
1560            table_name: name of the table to be uploaded
1561            col_schema: schema for table columns: type, size, etc from `formatDB`
1562            table_manifest: formatted manifest that can be uploaded as a table
1563            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1564            dmge: DataModelGraphExplorer object, used when upserting into an existing table
1565            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1566        Returns:
1567            manifest_table_id: synID of the uploaded table
1568
1569        """
1570        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1571        existing_table_id = self.syn.findEntityId(
1572            name=table_name, parent=table_parent_id
1573        )
1574
1575        tableOps = TableOperations(
1576            synStore=self,
1577            tableToLoad=table_manifest,
1578            tableName=table_name,
1579            datasetId=datasetId,
1580            existingTableId=existing_table_id,
1581            restrict=restrict,
1582            synapse_entity_tracker=self.synapse_entity_tracker,
1583        )
1584
1585        if not table_manipulation or existing_table_id is None:
1586            manifest_table_id = tableOps.createTable(
1587                columnTypeDict=col_schema,
1588                specifySchema=True,
1589            )
1590        elif existing_table_id is not None:
1591            if table_manipulation.lower() == "replace":
1592                manifest_table_id = tableOps.replaceTable(
1593                    specifySchema=True,
1594                    columnTypeDict=col_schema,
1595                )
1596            elif table_manipulation.lower() == "upsert":
1597                manifest_table_id = tableOps.upsertTable(
1598                    dmge=dmge,
1599                )
1600            elif table_manipulation.lower() == "update":
1601                manifest_table_id = tableOps.updateTable()
1602
1603        if table_manipulation and table_manipulation.lower() == "upsert":
1604            table_entity = self.synapse_entity_tracker.get(
1605                synapse_id=existing_table_id or manifest_table_id,
1606                syn=self.syn,
1607                download_file=False,
1608            )
1609            annos = OldAnnotations(
1610                id=table_entity.id,
1611                etag=table_entity.etag,
1612                values=table_entity.annotations,
1613            )
1614            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1615            annos = self.syn.set_annotations(annos)
1616            table_entity.etag = annos.etag
1617            table_entity.annotations = annos
1618
1619        return manifest_table_id
1620
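        # Behavior summary for buildDB (a sketch, not exhaustive): a new table is
        # created when no table named `table_name` exists under the dataset's project;
        # otherwise table_manipulation selects replace/upsert/update. For upserts, a
        # `primary_key` annotation of "<Component>_id" is also set on the table.
        #
        #   table_id = store.buildDB(
        #       "syn456", "patient_synapse_storage_manifest_table",
        #       col_schema, table_manifest, "upsert", dmge,
        #   )  # "syn456" and the other arguments are placeholders
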
1621    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1622    def upload_manifest_file(
1623        self,
1624        manifest,
1625        metadataManifestPath,
1626        datasetId,
1627        restrict_manifest,
1628        component_name="",
1629    ):
1630        # Update manifest to have the new entityId column
1631        manifest.to_csv(metadataManifestPath, index=False)
1632
1633        # store manifest to Synapse as a CSV
1634        # update file name
1635        file_name_full = metadataManifestPath.split("/")[-1]
1636        file_extension = file_name_full.split(".")[-1]
1637
1638        # Differentiate "censored" and "uncensored" manifest
1639        if "censored" in file_name_full:
1640            file_name_new = (
1641                os.path.basename(CONFIG.synapse_manifest_basename)
1642                + "_"
1643                + component_name
1644                + "_censored"
1645                + "."
1646                + file_extension
1647            )
1648        else:
1649            file_name_new = (
1650                os.path.basename(CONFIG.synapse_manifest_basename)
1651                + "_"
1652                + component_name
1653                + "."
1654                + file_extension
1655            )
1656
1657        manifest_synapse_file = None
1658        try:
1659            # Rename the file to file_name_new, then revert.
1660            # This is to maintain the original file name in case other code is
1661            # expecting that the file exists with the original name
1662            original_file_path = metadataManifestPath
1663            new_file_path = os.path.join(
1664                os.path.dirname(metadataManifestPath), file_name_new
1665            )
1666            os.rename(original_file_path, new_file_path)
1667
1668            manifest_synapse_file = self._store_file_for_manifest_upload(
1669                new_file_path=new_file_path,
1670                dataset_id=datasetId,
1671                existing_file_name=file_name_full,
1672                file_name_new=file_name_new,
1673                restrict_manifest=restrict_manifest,
1674            )
1675            manifest_synapse_file_id = manifest_synapse_file.id
1676
1677        finally:
1678            # Revert the file name back to the original
1679            os.rename(new_file_path, original_file_path)
1680
1681            if manifest_synapse_file:
1682                manifest_synapse_file.path = original_file_path
1683
1684        return manifest_synapse_file_id
1685
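        # Example of the file naming above (illustrative values): with
        # CONFIG.synapse_manifest_basename == "synapse_storage_manifest",
        # component_name == "patient", and a censored CSV, the stored file becomes
        # "synapse_storage_manifest_patient_censored.csv"; uncensored manifests
        # drop the "_censored" suffix.
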
1686    def _store_file_for_manifest_upload(
1687        self,
1688        new_file_path: str,
1689        dataset_id: str,
1690        existing_file_name: str,
1691        file_name_new: str,
1692        restrict_manifest: bool,
1693    ) -> File:
1694        """Handles a create or update of a manifest file that is going to be uploaded.
1695        If we already have a copy of the Entity in memory we will update that instance,
1696        otherwise create a new File instance to be created in Synapse. Once stored
1697        this will add the file to the `synapse_entity_tracker` for future reference.
1698
1699        Args:
1700            new_file_path (str): The path to the new manifest file
1701            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
1702            existing_file_name (str): The name of the existing file
1703            file_name_new (str): The name of the new file
1704            restrict_manifest (bool): Whether the manifest should be restricted
1705
1706        Returns:
1707            File: The stored manifest file
1708        """
1709        local_tracked_file_instance = (
1710            self.synapse_entity_tracker.search_local_by_parent_and_name(
1711                name=existing_file_name, parent_id=dataset_id
1712            )
1713            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1714                name=file_name_new, parent_id=dataset_id
1715            )
1716        )
1717
1718        if local_tracked_file_instance:
1719            local_tracked_file_instance.path = new_file_path
1720            local_tracked_file_instance.description = (
1721                "Manifest for dataset " + dataset_id
1722            )
1723            manifest_synapse_file = local_tracked_file_instance
1724        else:
1725            manifest_synapse_file = File(
1726                path=new_file_path,
1727                description="Manifest for dataset " + dataset_id,
1728                parent=dataset_id,
1729                name=file_name_new,
1730            )
1731
1732        manifest_synapse_file = self.syn.store(
1733            manifest_synapse_file, isRestricted=restrict_manifest
1734        )
1735
1736        self.synapse_entity_tracker.add(
1737            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1738        )
1739        return manifest_synapse_file
1740
1741    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1742        """get annotations asynchronously
1743
1744        Args:
1745            synapse_id (str): synapse id of the entity that the annotation belongs
1746
1747        Returns:
1748            Dict[str, Any]: The requested entity bundle matching
1749            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1750        """
1751        return await get_entity_id_bundle2(
1752            entity_id=synapse_id,
1753            request={"includeAnnotations": True},
1754            synapse_client=self.syn,
1755        )
1756
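        # Usage sketch (illustrative): awaiting the annotation bundle from
        # synchronous code. "syn111" is a placeholder entity ID.
        #
        #   bundle = asyncio.run(store.get_async_annotation("syn111"))
        #   print(bundle["annotations"]["annotations"])  # key/value annotations
        #   print(bundle["annotations"]["etag"])         # etag needed for re-storing
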
1757    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1758        """store annotation in an async way
1759
1760        Args:
1761            annotation_dict (dict): annotation in a dictionary format
1762
1763        Returns:
1764            Annotations: The stored annotations.
1765        """
1766        annotation_data = Annotations.from_dict(
1767            synapse_annotations=annotation_dict["annotations"]["annotations"]
1768        )
1769        annotation_class = Annotations(
1770            annotations=annotation_data,
1771            etag=annotation_dict["annotations"]["etag"],
1772            id=annotation_dict["annotations"]["id"],
1773        )
1774        annotation_storage_result = await annotation_class.store_async(
1775            synapse_client=self.syn
1776        )
1777        local_entity = self.synapse_entity_tracker.get(
1778            synapse_id=annotation_dict["annotations"]["id"],
1779            syn=self.syn,
1780            download_file=False,
1781            retrieve_if_not_present=False,
1782        )
1783        if local_entity:
1784            local_entity.etag = annotation_storage_result.etag
1785            local_entity.annotations = annotation_storage_result
1786        return annotation_storage_result
1787
1788    def process_row_annotations(
1789        self,
1790        dmge: DataModelGraphExplorer,
1791        metadata_syn: Dict[str, Any],
1792        hide_blanks: bool,
1793        csv_list_regex: str,
1794        annos: Dict[str, Any],
1795        annotation_keys: str,
1796    ) -> Dict[str, Any]:
1797        """Processes metadata annotations based on the logic below:
1798        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1799            An empty or whitespace-only string.
1800            A NaN value (if the annotation is a float).
1801        If either of the above conditions is met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
1802        If either condition is met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1803
1804        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1805        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1806
1807        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1808
1809        4. Returns the updated annotations dictionary.
1810
1811        Args:
1812            dmge (DataModelGraphExplorer): data model graph explorer
1813            metadata_syn (dict): metadata used for Synapse storage
1814            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1815            csv_list_regex (str): Regex to match with comma separated list
1816            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1817            annotation_keys (str): display_label/class_label
1818
1819        Returns:
1820            Dict[str, Any]: annotations as a dictionary
1821
1822        ```mermaid
1823        flowchart TD
1824            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1825            C -- Yes --> D{Is hide_blanks True?}
1826            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1827            D -- No --> F[Assign empty string to annotation key]
1828            C -- No --> G{Is anno_v a string?}
1829            G -- No --> H[Assign original value of anno_v to annotation key]
1830            G -- Yes --> I{Does anno_v match csv_list_regex?}
1831            I -- Yes --> J[Get validation rule of anno_k]
1832            J --> K{Does the validation rule contain 'list'}
1833            K -- Yes --> L[Split anno_v by commas and assign as list]
1834            I -- No --> H
1835            K -- No --> H
1836        ```
1837        """
1838        for anno_k, anno_v in metadata_syn.items():
1839            # Remove keys whose values are NaN, empty strings, or whitespace-only strings
1840            # from the dict of annotations to be uploaded, if present on the current row
1841            if hide_blanks and (
1842                (isinstance(anno_v, str) and anno_v.strip() == "")
1843                or (isinstance(anno_v, float) and np.isnan(anno_v))
1844            ):
1845                # Drop the key if it is present; nothing is uploaded for blank values
1846                if anno_k in annos["annotations"]["annotations"]:
1847                    annos["annotations"]["annotations"].pop(anno_k)
1848                continue
1849
1850            # Otherwise save annotation as appropriate
1851            if isinstance(anno_v, float) and np.isnan(anno_v):
1852                annos["annotations"]["annotations"][anno_k] = ""
1853                continue
1854
1855            # Handle strings that match the csv_list_regex and pass the validation rule
1856            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1857                # Use a dictionary to dynamically choose the argument
1858                param = (
1859                    {"node_display_name": anno_k}
1860                    if annotation_keys == "display_label"
1861                    else {"node_label": anno_k}
1862                )
1863                node_validation_rules = dmge.get_node_validation_rules(**param)
1864
1865                if rule_in_rule_list("list", node_validation_rules):
1866                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1867                    continue
1868            # default: assign the original value
1869            annos["annotations"]["annotations"][anno_k] = anno_v
1870
1871        return annos
1872
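        # Worked example for the list handling above (illustrative): if anno_v is
        # "breast,colon" and the attribute's validation rules include "list", the
        # stored annotation value becomes ["breast", "colon"]; without a "list"
        # rule, the comma-separated string is stored unchanged.
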
1873    @async_missing_entity_handler
1874    async def format_row_annotations(
1875        self,
1876        dmge: DataModelGraphExplorer,
1877        row: pd.Series,
1878        entityId: str,
1879        hideBlanks: bool,
1880        annotation_keys: str,
1881    ) -> Union[None, Dict[str, Any]]:
1882        """Format row annotations
1883
1884        Args:
1885            dmge (DataModelGraphExplorer): data model graph explorer object
1886            row (pd.Series): row of the manifest
1887            entityId (str): entity id of the manifest
1888            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, upload Annotation keys with empty string values
1889            annotation_keys (str): display_label/class_label
1890
1891        Returns:
1892            Union[None, Dict[str, Any]]: if entity id is in trash can, return None. Otherwise, return the annotations
1893        """
1894        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support, e.g. no spaces or parentheses)
1895        # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest
1896        # this could create a divergence between manifest column and annotations. this should be ok for most use cases.
1897        # columns with special characters are outside of the schema
1898        metadataSyn = {}
1899        blacklist_chars = ["(", ")", ".", " ", "-"]
1900
1901        for k, v in row.to_dict().items():
1902            if annotation_keys == "display_label":
1903                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1904            elif annotation_keys == "class_label":
1905                keySyn = get_class_label_from_display_name(str(k)).translate(
1906                    {ord(x): "" for x in blacklist_chars}
1907                )
1908
1909            # Skip `Filename` and `ETag` columns when setting annotations
1910            if keySyn in ["Filename", "ETag", "eTag"]:
1911                continue
1912
1913            # truncate annotation values to 500 characters if the
1914            # size of values is greater than equal to 500 characters
1915            # add an explicit [truncatedByDataCuratorApp] message at the end
1916            # of every truncated message to indicate that the cell value
1917            # has been truncated
1918            if isinstance(v, str) and len(v) >= 500:
1919                v = v[0:472] + "[truncatedByDataCuratorApp]"
1920
1921            metadataSyn[keySyn] = v
1922
1923        # This will first check if the entity is already in memory, and if so, that
1924        # instance is used. Unfortunately, the expected return format needs to match
1925        # the Synapse API, so we need to convert the annotations to the expected format.
1926        entity = self.synapse_entity_tracker.get(
1927            synapse_id=entityId,
1928            syn=self.syn,
1929            download_file=False,
1930            retrieve_if_not_present=False,
1931        )
1932        if entity is not None:
1933            synapse_annotations = _convert_to_annotations_list(
1934                annotations=entity.annotations
1935            )
1936            annos = {
1937                "annotations": {
1938                    "id": entity.id,
1939                    "etag": entity.etag,
1940                    "annotations": synapse_annotations,
1941                }
1942            }
1943        else:
1944            annos = await self.get_async_annotation(entityId)
1945
1946        # set annotation(s) for the various objects/items in a dataset on Synapse
1947        csv_list_regex = comma_separated_list_regex()
1948
1949        annos = self.process_row_annotations(
1950            dmge=dmge,
1951            metadata_syn=metadataSyn,
1952            hide_blanks=hideBlanks,
1953            csv_list_regex=csv_list_regex,
1954            annos=annos,
1955            annotation_keys=annotation_keys,
1956        )
1957
1958        return annos
1959
1960    @missing_entity_handler
1961    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
1962    def format_manifest_annotations(self, manifest, manifest_synapse_id):
1963        """
1964        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
1965        For now just getting the Component.
1966        """
1967
1968        entity = self.synapse_entity_tracker.get(
1969            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
1970        )
1971        is_file = entity.concreteType.endswith(".FileEntity")
1972        is_table = entity.concreteType.endswith(".TableEntity")
1973        metadata = {}
1974        if is_file:
1975            # Get file metadata
1976            metadata = self.getFileAnnotations(manifest_synapse_id)
1977
1978            # If there is a defined component add it to the metadata.
1979            if "Component" in manifest.columns:
1980                # Gather component information
1981                component = manifest["Component"].unique()
1982
1983                # Double check that only a single component is listed, else raise an error.
1984                # (A bare `len(component) == 1` comparison inside try/except can never
1985                # raise, so the check is made explicit here.)
1986                if len(component) != 1:
1987                    raise ValueError(
1988                        "Manifest has more than one component. Please check manifest and resubmit."
1989                    )
1990
1991                # Add component to metadata
1992                metadata["Component"] = component[0]
1993
1994        elif is_table:
1995            # Get table metadata
1996            metadata = self.getTableAnnotations(manifest_synapse_id)
1997
1998        # Get annotations
1999        annos = OldAnnotations(
2000            id=entity.id, etag=entity.etag, values=entity.annotations
2001        )
2002
2003        # Add metadata to the annotations
2004        for annos_k, annos_v in metadata.items():
2005            annos[annos_k] = annos_v
2006
2007        return annos
2008
2009    '''
2010    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
2011        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
2012        """
2013        Purpose:
2014            Works very similarly to associateMetadataWithFiles except takes in the manifest
2015            rather than the manifest path
2016
2017        """
2018
2019        # Add uuid for table updates and fill.
2020        if not "Uuid" in manifest.columns:
2021            manifest["Uuid"] = ''
2022
2023        for idx,row in manifest.iterrows():
2024            if not row["Uuid"]:
2025                gen_uuid = uuid.uuid4()
2026                row["Uuid"] = gen_uuid
2027                manifest.loc[idx, 'Uuid'] = gen_uuid
2028
2029        # add entityId as a column if not already there or
2030        # fill any blanks with an empty string.
2031        if not "entityId" in manifest.columns:
2032            manifest["entityId"] = ""
2033        else:
2034            manifest["entityId"].fillna("", inplace=True)
2035
2036        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
2037        dmge = DataModelGraphExplorer()
2038
2039        # Create table name here.
2040        if 'Component' in manifest.columns:
2041            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
2042        else:
2043            table_name = 'synapse_storage_manifest_table'
2044
2045        # Upload manifest as a table and get the SynID and manifest
2046        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
2047                                                    dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
2048
2049        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
2050        # also set metadata for each synapse entity as Synapse annotations
2051        for idx, row in manifest.iterrows():
2052            if not row["entityId"]:
2053                # If not using entityIds, fill with manifest_table_id so
2054                row["entityId"] = manifest_synapse_table_id
2055                entityId = ''
2056            else:
2057                # get the entity id corresponding to this row
2058                entityId = row["entityId"]
2059
2060        # Load manifest to synapse as a CSV File
2061        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
2062
2063        # Get annotations for the file manifest.
2064        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
2065
2066        self.syn.set_annotations(manifest_annotations)
2067
2068        logger.info("Associated manifest file with dataset on Synapse.")
2069
2070        # Update manifest Synapse table with new entity id column.
2071        self.make_synapse_table(
2072            table_to_load = table_manifest,
2073            dataset_id = datasetId,
2074            existingTableId = manifest_synapse_table_id,
2075            table_name = table_name,
2076            update_col = 'Uuid',
2077            specify_schema = False,
2078            )
2079
2080        # Get annotations for the table manifest
2081        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
2082        self.syn.set_annotations(manifest_annotations)
2083        return manifest_synapse_table_id
2084    '''
2085
2086    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
2087        """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing.
2088        Args:
2089            metadataManifestPath (str): path where manifest is stored
2090        Returns:
2091            manifest(pd.DataFrame): Manifest loaded as a pandas dataframe
2092        Raises:
2093            FileNotFoundError: Manifest file does not exist at provided path.
2094        """
2095        # read new manifest csv
2096        try:
2097            load_args = {
2098                "dtype": "string",
2099            }
2100            manifest = load_df(
2101                metadataManifestPath,
2102                preserve_raw_input=False,
2103                allow_na_values=False,
2104                **load_args,
2105            )
2106        except FileNotFoundError as err:
2107            raise FileNotFoundError(
2108                f"No manifest file was found at this path: {metadataManifestPath}"
2109            ) from err
2110        return manifest
2111
2112    def _add_id_columns_to_manifest(
2113        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
2114    ) -> pd.DataFrame:
2115        """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row.
2116        Args:
2117            Manifest loaded as a pd.Dataframe
2118        Returns (pd.DataFrame):
2119            Manifest df with new Id and EntityId columns (and UUID values) if they were not already present.
2120        """
2121
2122        # Add Id for table updates and fill.
2123        if not col_in_dataframe("Id", manifest):
2124            # See if schema has `Uuid` column specified
2125            try:
2126                uuid_col_in_schema = dmge.is_class_in_schema(
2127                    "Uuid"
2128                ) or dmge.is_class_in_schema("uuid")
2129            except KeyError:
2130                uuid_col_in_schema = False
2131
2132            # Rename `Uuid` column if it wasn't specified in the schema
2133            if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema:
2134                manifest.rename(columns={"Uuid": "Id"}, inplace=True)
2135            # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column
2136            else:
2137                manifest["Id"] = ""
2138
2139        # Retrieve the ID column name (id, Id, and ID are treated the same).
2140        id_col_name = [col for col in manifest.columns if col.lower() == "id"][0]
2141
2142        # Check if values have been added to the Id column; if not, add a UUID so the value in the row is not blank.
2143        for idx, row in manifest.iterrows():
2144            if not row[id_col_name]:
2145                gen_uuid = str(uuid.uuid4())
2146                row[id_col_name] = gen_uuid
2147                manifest.loc[idx, id_col_name] = gen_uuid
2148
2149        # add entityId as a column if not already there or
2150        # fill any blanks with an empty string.
2151        if not col_in_dataframe("entityId", manifest):
2152            manifest["entityId"] = ""
2153        else:
2154            manifest["entityId"].fillna("", inplace=True)
2155
2156        return manifest
2157
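        # Example (illustrative): a manifest without Id/entityId columns gains both,
        # with each blank Id filled by a fresh UUID4 string.
        #
        #   df = pd.DataFrame({"Filename": ["a.txt"]})
        #   df = store._add_id_columns_to_manifest(df, dmge)
        #   list(df.columns)  # -> ['Filename', 'Id', 'entityId']
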
2158    def _generate_table_name(self, manifest):
2159        """Helper function to generate a table name for upload to synapse.
2160
2161        Args:
2162            manifest: Manifest loaded as a pd.DataFrame
2163
2164        Returns:
2165            table_name (str): Name of the table to load
2166            component_name (str): Name of the manifest component (if applicable)
2167        """
2168        # Create table name here.
2169        if "Component" in manifest.columns:
2170            component_name = manifest["Component"][0].lower()
2171            table_name = component_name + "_synapse_storage_manifest_table"
2172        else:
2173            component_name = ""
2174            table_name = "synapse_storage_manifest_table"
2175        return table_name, component_name
2176
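        # Example (illustrative): a manifest whose Component column starts with
        # "Patient" yields ("patient_synapse_storage_manifest_table", "patient");
        # manifests without a Component column fall back to
        # ("synapse_storage_manifest_table", "").
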
2177    def _create_entity_id(self, idx, row, manifest, datasetId):
2178        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
2179        Args:
2180            idx, row: index and contents of the current manifest row being processed
2181            manifest (pd.DataFrame): loaded df containing user supplied data.
2182            datasetId (str): synapse ID of folder containing the dataset
2183
2184        Returns:
2185            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
2186            entityId (str): Generated Entity Id.
2187
2188        """
2189        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
2190        rowEntity = self.syn.store(rowEntity)
2191        entityId = rowEntity["id"]
2192        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
2193        row["entityId"] = entityId
2194        manifest.loc[idx, "entityId"] = entityId
2195        return manifest, entityId
2196
2197    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
2198        """Process annotations and store them on synapse asynchronously
2199
2200        Args:
2201            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step
2202
2203        Raises:
2204            RuntimeError: raise a run time error if a task failed to complete
2205        """
2206        while requests:
2207            done_tasks, pending_tasks = await asyncio.wait(
2208                requests, return_when=asyncio.FIRST_COMPLETED
2209            )
2210            requests = pending_tasks
2211
2212            for completed_task in done_tasks:
2213                try:
2214                    annos = completed_task.result()
2215
2216                    if isinstance(annos, Annotations):
2217                        logger.info(f"Successfully stored annotations for {annos.id}")
2218                    else:
2219                        # store annotations if they are not None
2220                        if annos:
2221                            entity_id = annos["annotations"]["id"]
2222                            logger.info(
2223                                f"Obtained and processed annotations for {entity_id} entity"
2224                            )
2225                            requests.add(
2226                                asyncio.create_task(
2227                                    self.store_async_annotation(annotation_dict=annos)
2228                                )
2229                            )
2230                except Exception as e:
2231                    raise RuntimeError(f"Annotation storage task failed with {repr(e)}.") from e
2232
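        # Sketch of the task-pumping pattern above: formatting tasks finish first and
        # each re-enqueues a storage task for its entity, so the set drains only after
        # every format + store round trip completes.
        #
        #   requests = {
        #       asyncio.create_task(self.format_row_annotations(...))  # per manifest row
        #   }
        #   await self._process_store_annos(requests)
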
2233    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2234    async def add_annotations_to_entities_files(
2235        self,
2236        dmge,
2237        manifest,
2238        manifest_record_type: str,
2239        datasetId: str,
2240        hideBlanks: bool,
2241        manifest_synapse_table_id="",
2242        annotation_keys: str = "class_label",
2243    ):
2244        """
2245        Depending on upload type, add Ids to the entityId column. Add annotations to connected
2246        files and folders. Despite the name of this function, it also applies to folders.
2247
2248        Args:
2249            dmge: DataModelGraphExplorer Object
2250            manifest (pd.DataFrame): loaded df containing user supplied data.
2251            manifest_record_type: str. Values used by this method include 'file_and_entities', 'table_and_file', and 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
2252            datasetId (str): synapse ID of folder containing the dataset
2253            hideBlanks (bool): Default is False. When true, annotation keys with blank values are not uploaded; when false, they are uploaded with empty string values.
2254            manifest_synapse_table_id (str): Default is an empty string ''.
2255            annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
2256                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2257                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2258        Returns:
2259            manifest (pd.DataFrame): modified to add entityId as appropriate
2260
2261        """
2262
2263        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2264        if "filename" in [col.lower() for col in manifest.columns]:
2265            # get current list of files and store as dataframe
2266            dataset_files = self.getFilesInStorageDataset(datasetId)
2267            files_and_entityIds = self._get_file_entityIds(
2268                dataset_files=dataset_files, only_new_files=False
2269            )
2270            file_df = pd.DataFrame(files_and_entityIds)
2271
2272            # Merge dataframes to add entityIds
2273            manifest = manifest.merge(
2274                file_df, how="left", on="Filename", suffixes=["_x", None]
2275            ).drop("entityId_x", axis=1)
2276
2277        # Fill `entityId` for each row if missing and annotate entity as appropriate
2278        requests = set()
2279        for idx, row in manifest.iterrows():
2280            if not row["entityId"] and (
2281                manifest_record_type == "file_and_entities"
2282                or manifest_record_type == "table_file_and_entities"
2283            ):
2284                manifest, entityId = self._create_entity_id(
2285                    idx, row, manifest, datasetId
2286                )
2287            elif not row["entityId"] and manifest_record_type == "table_and_file":
2288                # If not using entityIds, fill with manifest_table_id so the row is not blank
2289                row["entityId"] = manifest_synapse_table_id
2290                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2291                entityId = ""
2292            # If the row is the manifest table, do not add annotations
2293            elif row["entityId"] == manifest_synapse_table_id:
2294                entityId = ""
2295            else:
2296                # get the file id of the file to annotate, collected in above step.
2297                entityId = row["entityId"]
2298
2299            # Adding annotations to connected files.
2300            if entityId:
2301                # Format annotations for Synapse
2302                annos_task = asyncio.create_task(
2303                    self.format_row_annotations(
2304                        dmge, row, entityId, hideBlanks, annotation_keys
2305                    )
2306                )
2307                requests.add(annos_task)
2308        await self._process_store_annos(requests)
2309        return manifest
2310
2311    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2312    def upload_manifest_as_table(
2313        self,
2314        dmge: DataModelGraphExplorer,
2315        manifest: pd.DataFrame,
2316        metadataManifestPath: str,
2317        datasetId: str,
2318        table_name: str,
2319        component_name: str,
2320        restrict: bool,
2321        manifest_record_type: str,
2322        hideBlanks: bool,
2323        table_manipulation: str,
2324        table_column_names: str,
2325        annotation_keys: str,
2326        file_annotations_upload: bool = True,
2327    ):
2328        """Upload manifest to Synapse as a table and csv.
2329        Args:
2330            dmge: DataModelGraphExplorer object
2331            manifest (pd.DataFrame): loaded df containing user supplied data.
2332            metadataManifestPath: path to csv containing a validated metadata manifest.
2333            datasetId (str): synapse ID of folder containing the dataset
2334            table_name (str): Generated to name the table being uploaded.
2335            component_name (str): Name of the component manifest that is currently being uploaded.
2336            restrict (bool): Flag for censored data.
2337            manifest_record_type (str): Values include 'file_and_entities', 'table_and_file', and 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or do both.
2338            hideBlanks (bool): Default is False. When true, annotation keys with blank values are not uploaded; when false, they are uploaded with empty string values.
2339            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2340            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2341                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2342                display label formatting.
2343            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2344                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2345                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2346            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2347        Return:
2348            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2349        """
2350        # Upload manifest as a table, get the ID and updated manifest.
2351        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2352            dmge=dmge,
2353            manifest=manifest,
2354            datasetId=datasetId,
2355            table_name=table_name,
2356            restrict=restrict,
2357            table_manipulation=table_manipulation,
2358            table_column_names=table_column_names,
2359        )
2360
2361        if file_annotations_upload:
2362            manifest = asyncio.run(
2363                self.add_annotations_to_entities_files(
2364                    dmge,
2365                    manifest,
2366                    manifest_record_type,
2367                    datasetId,
2368                    hideBlanks,
2369                    manifest_synapse_table_id,
2370                    annotation_keys,
2371                )
2372            )
2373        # Load manifest to synapse as a CSV File
2374        manifest_synapse_file_id = self.upload_manifest_file(
2375            manifest=manifest,
2376            metadataManifestPath=metadataManifestPath,
2377            datasetId=datasetId,
2378            restrict_manifest=restrict,
2379            component_name=component_name,
2380        )
2381
2382        # Set annotations for the file manifest.
2383        manifest_annotations = self.format_manifest_annotations(
2384            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2385        )
2386        annos = self.syn.set_annotations(annotations=manifest_annotations)
2387        manifest_entity = self.synapse_entity_tracker.get(
2388            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2389        )
2390        manifest_entity.annotations = annos
2391        manifest_entity.etag = annos.etag
2392
2393        logger.info("Associated manifest file with dataset on Synapse.")
2394
2395        # Update manifest Synapse table with new entity id column.
2396        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2397            dmge=dmge,
2398            manifest=manifest,
2399            datasetId=datasetId,
2400            table_name=table_name,
2401            restrict=restrict,
2402            table_manipulation="update",
2403            table_column_names=table_column_names,
2404        )
2405
2406        # Set annotations for the table manifest
2407        manifest_annotations = self.format_manifest_annotations(
2408            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2409        )
2410        annotations_manifest_table = self.syn.set_annotations(
2411            annotations=manifest_annotations
2412        )
2413        manifest_table_entity = self.synapse_entity_tracker.get(
2414            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2415        )
2416        manifest_table_entity.annotations = annotations_manifest_table
2417        manifest_table_entity.etag = annotations_manifest_table.etag
2418
2419        return manifest_synapse_file_id
2420
2421    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2422    def upload_manifest_as_csv(
2423        self,
2424        dmge,
2425        manifest,
2426        metadataManifestPath,
2427        datasetId,
2428        restrict,
2429        manifest_record_type,
2430        hideBlanks,
2431        component_name,
2432        annotation_keys: str,
2433        file_annotations_upload: bool = True,
2434    ):
2435        """Upload manifest to Synapse as a csv only.
2436        Args:
2437            dmge: DataModelGraphExplorer object
2438            manifest (pd.DataFrame): loaded df containing user supplied data.
2439            metadataManifestPath: path to csv containing a validated metadata manifest.
2440            datasetId (str): synapse ID of folder containing the dataset
2441            restrict (bool): Flag for censored data.
2442            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2443            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
2444            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2445                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2446                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2447            file_annotations_upload (bool): Default to True. If False, do not add annotations to files.
2448        Return:
2449            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2450        """
2451        if file_annotations_upload:
2452            manifest = asyncio.run(
2453                self.add_annotations_to_entities_files(
2454                    dmge,
2455                    manifest,
2456                    manifest_record_type,
2457                    datasetId,
2458                    hideBlanks,
2459                    annotation_keys=annotation_keys,
2460                )
2461            )
2462
2463        # Load manifest to synapse as a CSV File
2464        manifest_synapse_file_id = self.upload_manifest_file(
2465            manifest,
2466            metadataManifestPath,
2467            datasetId,
2468            restrict,
2469            component_name=component_name,
2470        )
2471
2472        # Set annotations for the file manifest.
2473        manifest_annotations = self.format_manifest_annotations(
2474            manifest, manifest_synapse_file_id
2475        )
2476        annos = self.syn.set_annotations(manifest_annotations)
2477        manifest_entity = self.synapse_entity_tracker.get(
2478            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2479        )
2480        manifest_entity.annotations = annos
2481        manifest_entity.etag = annos.etag
2482
2483        logger.info("Associated manifest file with dataset on Synapse.")
2484
2485        return manifest_synapse_file_id
2486
2487    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2488    def upload_manifest_combo(
2489        self,
2490        dmge,
2491        manifest,
2492        metadataManifestPath,
2493        datasetId,
2494        table_name,
2495        component_name,
2496        restrict,
2497        manifest_record_type,
2498        hideBlanks,
2499        table_manipulation,
2500        table_column_names: str,
2501        annotation_keys: str,
2502        file_annotations_upload: bool = True,
2503    ):
2504        """Upload manifest to Synapse as a table and CSV with entities.
2505        Args:
2506            dmge: DataModelGraphExplorer object
2507            manifest (pd.DataFrame): loaded df containing user supplied data.
2508            metadataManifestPath: path to csv containing a validated metadata manifest.
2509            datasetId (str): synapse ID of folder containing the dataset
2510            table_name (str): Generated to name the table being uploaded.
2511            component_name (str): Name of the component manifest that is currently being uploaded.
2512            restrict (bool): Flag for censored data.
2513            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2514            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
2515            table_manipulation (str): Specifies how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2516            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2517                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2518                display label formatting.
2519            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2520                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2521                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2522            file_annotations_upload (bool): Default to True. If False, do not add annotations to files.
2523        Return:
2524            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2525        """
2526        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2527            dmge=dmge,
2528            manifest=manifest,
2529            datasetId=datasetId,
2530            table_name=table_name,
2531            restrict=restrict,
2532            table_manipulation=table_manipulation,
2533            table_column_names=table_column_names,
2534        )
2535
2536        if file_annotations_upload:
2537            manifest = asyncio.run(
2538                self.add_annotations_to_entities_files(
2539                    dmge,
2540                    manifest,
2541                    manifest_record_type,
2542                    datasetId,
2543                    hideBlanks,
2544                    manifest_synapse_table_id,
2545                    annotation_keys=annotation_keys,
2546                )
2547            )
2548
2549        # Load manifest to synapse as a CSV File
2550        manifest_synapse_file_id = self.upload_manifest_file(
2551            manifest, metadataManifestPath, datasetId, restrict, component_name
2552        )
2553
2554        # Set annotations for the file manifest.
2555        manifest_annotations = self.format_manifest_annotations(
2556            manifest, manifest_synapse_file_id
2557        )
2558        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2559        manifest_entity = self.synapse_entity_tracker.get(
2560            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2561        )
2562        manifest_entity.annotations = file_manifest_annotations
2563        manifest_entity.etag = file_manifest_annotations.etag
2564        logger.info("Associated manifest file with dataset on Synapse.")
2565
2566        # Update manifest Synapse table with new entity id column.
2567        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2568            dmge=dmge,
2569            manifest=manifest,
2570            datasetId=datasetId,
2571            table_name=table_name,
2572            restrict=restrict,
2573            table_manipulation="update",
2574            table_column_names=table_column_names,
2575        )
2576
2577        # Set annotations for the table manifest
2578        manifest_annotations = self.format_manifest_annotations(
2579            manifest, manifest_synapse_table_id
2580        )
2581        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2582        manifest_entity = self.synapse_entity_tracker.get(
2583            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2584        )
2585        manifest_entity.annotations = table_manifest_annotations
2586        manifest_entity.etag = table_manifest_annotations.etag
2587        return manifest_synapse_file_id
2588
2589    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2590    def associateMetadataWithFiles(
2591        self,
2592        dmge: DataModelGraphExplorer,
2593        metadataManifestPath: str,
2594        datasetId: str,
2595        manifest_record_type: str = "table_file_and_entities",
2596        hideBlanks: bool = False,
2597        restrict_manifest: bool = False,
2598        table_manipulation: str = "replace",
2599        table_column_names: str = "class_label",
2600        annotation_keys: str = "class_label",
2601        file_annotations_upload: bool = True,
2602    ) -> str:
2603        """Associate metadata with files in a storage dataset already on Synapse.
2604        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2605
2606        If this is a new manifest, there may be no Synapse entities associated with the rows of the manifest;
2607        this can occur when the data type (e.g. clinical data) is tabular
2608        and does not require files. To utilize uniform interfaces downstream
2609        (i.e. fileviews), a Synapse entity (a folder) is created for each row,
2610        and an entity column is added to the manifest containing the resulting
2611        entity IDs; at present, a table is also created as an additional interface
2612        for downstream query and interaction with the data.
2613
2614        Args:
2615            dmge: DataModelGraphExplorer Object
2616            metadataManifestPath: path to csv containing a validated metadata manifest.
2617            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2618            Some datasets, e.g. clinical data, do not contain file IDs, but data is stored in a table: one row per item.
2619            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
2620            datasetId: synapse ID of folder containing the dataset
2621            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table_and_file' options in combination.
2622            hideBlanks: Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
2623            restrict_manifest (bool): Default is False. Flag for censored data.
2624            table_manipulation (str): Default is 'replace'. Specifies how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2625            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2626                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2627                display label formatting.
2628            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2629                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2630                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2631        Returns:
2632            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
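
        Example:
            Illustrative sketch; assumes `dmge` is a previously constructed
            DataModelGraphExplorer, and the IDs/paths below are hypothetical:

            >>> store = SynapseStorage()
            >>> manifest_id = store.associateMetadataWithFiles(
            ...     dmge=dmge,
            ...     metadataManifestPath="data/manifest.csv",
            ...     datasetId="syn12345678",
            ...     manifest_record_type="table_and_file",
            ... )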
2633        """
2634        # Read new manifest CSV:
2635        manifest = self._read_manifest(metadataManifestPath)
2636        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2637
2638        table_name, component_name = self._generate_table_name(manifest)
2639
2640        # Upload manifest to synapse based on user input (manifest_record_type)
2641        if manifest_record_type == "file_only":
2642            manifest_synapse_file_id = self.upload_manifest_as_csv(
2643                dmge=dmge,
2644                manifest=manifest,
2645                metadataManifestPath=metadataManifestPath,
2646                datasetId=datasetId,
2647                restrict=restrict_manifest,
2648                hideBlanks=hideBlanks,
2649                manifest_record_type=manifest_record_type,
2650                component_name=component_name,
2651                annotation_keys=annotation_keys,
2652                file_annotations_upload=file_annotations_upload,
2653            )
2654        elif manifest_record_type == "table_and_file":
2655            manifest_synapse_file_id = self.upload_manifest_as_table(
2656                dmge=dmge,
2657                manifest=manifest,
2658                metadataManifestPath=metadataManifestPath,
2659                datasetId=datasetId,
2660                table_name=table_name,
2661                component_name=component_name,
2662                restrict=restrict_manifest,
2663                hideBlanks=hideBlanks,
2664                manifest_record_type=manifest_record_type,
2665                table_manipulation=table_manipulation,
2666                table_column_names=table_column_names,
2667                annotation_keys=annotation_keys,
2668                file_annotations_upload=file_annotations_upload,
2669            )
2670        elif manifest_record_type == "file_and_entities":
2671            manifest_synapse_file_id = self.upload_manifest_as_csv(
2672                dmge=dmge,
2673                manifest=manifest,
2674                metadataManifestPath=metadataManifestPath,
2675                datasetId=datasetId,
2676                restrict=restrict_manifest,
2677                hideBlanks=hideBlanks,
2678                manifest_record_type=manifest_record_type,
2679                component_name=component_name,
2680                annotation_keys=annotation_keys,
2681                file_annotations_upload=file_annotations_upload,
2682            )
2683        elif manifest_record_type == "table_file_and_entities":
2684            manifest_synapse_file_id = self.upload_manifest_combo(
2685                dmge=dmge,
2686                manifest=manifest,
2687                metadataManifestPath=metadataManifestPath,
2688                datasetId=datasetId,
2689                table_name=table_name,
2690                component_name=component_name,
2691                restrict=restrict_manifest,
2692                hideBlanks=hideBlanks,
2693                manifest_record_type=manifest_record_type,
2694                table_manipulation=table_manipulation,
2695                table_column_names=table_column_names,
2696                annotation_keys=annotation_keys,
2697                file_annotations_upload=file_annotations_upload,
2698            )
2699        else:
2700            raise ValueError(f"Invalid manifest_record_type '{manifest_record_type}'; expected one of 'file_only', 'file_and_entities', 'table_and_file', or 'table_file_and_entities'.")
2701        return manifest_synapse_file_id
2702
2703    def getTableAnnotations(self, table_id: str):
2704        """Generate dictionary of annotations for the given Synapse file.
2705        Synapse returns all custom annotations as lists since they
2706        can contain multiple values. In all cases, the values will
2707        be converted into strings and concatenated with ", ".
2708
2709        Args:
2710            fileId (str): Synapse ID for dataset file.
2711
2712        Returns:
2713            dict: Annotations as comma-separated strings.
2714        """
2715        try:
2716            entity = self.synapse_entity_tracker.get(
2717                synapse_id=table_id, syn=self.syn, download_file=False
2718            )
2719            is_table = entity.concreteType.endswith(".TableEntity")
2720            annotations_raw = entity.annotations
2721        except SynapseHTTPError:
2722            # If an error occurs with retrieving entity, skip it
2723            # This could be caused by a temporary file view that
2724            # was deleted since its ID was retrieved
2725            is_table = False
2726
2727        # Skip anything that isn't a table
2728        if not is_table:
2729            return None
2730
2731        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2732
2733        return annotations
2734
2735    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2736        """Generate dictionary of annotations for the given Synapse file.
2737        Synapse returns all custom annotations as lists since they
2738        can contain multiple values. In all cases, the values will
2739        be converted into strings and concatenated with ", ".
2740
2741        Args:
2742            fileId (str): Synapse ID for dataset file.
2743
2744        Returns:
2745            dict: Annotations as comma-separated strings.
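
        Example:
            Illustrative only; the file ID and annotation values are hypothetical:

            >>> store.getFileAnnotations("syn11111111")
            {'author': 'bruno, milen, sujay', 'entityId': 'syn11111111', 'eTag': '...'}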
2746        """
2747
2748        # Get entity metadata, including annotations
2749        try:
2750            entity = self.synapse_entity_tracker.get(
2751                synapse_id=fileId, syn=self.syn, download_file=False
2752            )
2753            is_file = entity.concreteType.endswith(".FileEntity")
2754            is_folder = entity.concreteType.endswith(".Folder")
2755            annotations_raw = entity.annotations
2756        except SynapseHTTPError:
2757            # If an error occurs with retrieving entity, skip it
2758            # This could be caused by a temporary file view that
2759            # was deleted since its ID was retrieved
2760            is_file, is_folder = False, False
2761
2762        # Skip anything that isn't a file or folder
2763        if not (is_file or is_folder):
2764            return None
2765
2766        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2767
2768        return annotations
2769
2770    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2771        # Extract annotations from their lists and stringify. For example:
2772        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
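        #   becomes: {'YearofBirth': '1980', 'author': 'bruno, milen, sujay'}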
2773        annotations = dict()
2774        for key, vals in annotations_raw.items():
2775            if isinstance(vals, list) and len(vals) == 1:
2776                annotations[key] = str(vals[0])
2777            else:
2778                annotations[key] = ", ".join(str(v) for v in vals)
2779
2780        # Add the file entity ID and eTag, which weren't lists
2781        assert fileId == entity.id, (
2782            "For some reason, the Synapse ID in the response doesn't match "
2783            "the Synapse ID sent in the request (via synapseclient)."
2784        )
2785        annotations["entityId"] = fileId
2786        annotations["eTag"] = entity.etag
2787
2788        return annotations
2789
2790    def getDatasetAnnotations(
2791        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2792    ) -> pd.DataFrame:
2793        """Generate table for annotations across all files in given dataset.
2794
2795        Args:
2796            datasetId (str): Synapse ID for dataset folder.
2797            fill_na (bool): Whether to replace missing values with
2798                blank strings.
2799            force_batch (bool): Whether to force the function to use
2800                the batch mode, which uses a file view to retrieve
2801                annotations for a given dataset. Default to False
2802                unless there are more than 50 files in the dataset.
2803
2804        Returns:
2805            pd.DataFrame: Table of annotations.
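
        Example:
            Illustrative sketch with a hypothetical dataset ID; `Filename` is
            inserted first, and `entityId`/`eTag` are moved to the end:

            >>> store.getDatasetAnnotations("syn12345678").columns.tolist()
            ['Filename', 'YearofBirth', 'entityId', 'eTag']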
2806        """
2807        # Get all files in given dataset
2808        dataset_files = self.getFilesInStorageDataset(datasetId)
2809
2810        # If there are no dataset files, there are no annotations:
2811        # return an empty data frame
2812        if not dataset_files:
2813            return pd.DataFrame()
2814
2815        dataset_files_map = dict(dataset_files)
2816        dataset_file_ids, _ = list(zip(*dataset_files))
2817
2818        # Get annotations for each file from Step 1
2819        # Batch mode
2820        try_batch = len(dataset_files) >= 50 or force_batch
2821        if try_batch:
2822            try:
2823                logger.info("Trying batch mode for retrieving Synapse annotations")
2824                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2825            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2826                # Fall back to the slower non-batch method
2827                logger.info(
2828                    f"Unable to create a temporary file view bound to {datasetId} "
2829                    "(likely a permission error). Defaulting to slower iterative "
2830                    "retrieval of annotations."
2831                )
2832                try_batch = False
2833
2834        # Non-batch mode
2835        if not try_batch:
2836            logger.info("Using slower (non-batch) sequential mode")
2837            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2838            # Remove any annotations for non-file/folders (stored as None)
2839            records = filter(None, records)
2840            table = pd.DataFrame.from_records(records)
2841
2842        # Add filenames for the files that "survived" annotation retrieval
2843        filenames = [dataset_files_map[i] for i in table["entityId"]]
2844
2845        if "Filename" not in table.columns:
2846            table.insert(0, "Filename", filenames)
2847
2848        # Ensure that entityId and eTag are at the end
2849        entity_ids = table.pop("entityId")
2850        etags = table.pop("eTag")
2851        table.insert(len(table.columns), "entityId", entity_ids)
2852        table.insert(len(table.columns), "eTag", etags)
2853
2854        # Missing values are filled in with empty strings for Google Sheets
2855        if fill_na:
2856            table.fillna("", inplace=True)
2857
2858        # Force all values as strings
2859        return table.astype(str)
2860
2861    def raise_final_error(retry_state):
        """Tenacity callback that re-raises the exception from the final retry attempt."""
2862        return retry_state.outcome.result()
2863
2864    def checkIfinAssetView(self, syn_id) -> bool:
2865        # get data in administrative fileview for this pipeline
2866        assetViewTable = self.getStorageFileviewTable()
2867        all_files = list(assetViewTable["id"])
2868        return syn_id in all_files
2872
2873    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2874    @retry(
2875        stop=stop_after_attempt(5),
2876        wait=wait_chain(
2877            *[wait_fixed(10) for i in range(2)]
2878            + [wait_fixed(15) for i in range(2)]
2879            + [wait_fixed(20)]
2880        ),
2881        retry=retry_if_exception_type(LookupError),
2882        retry_error_callback=raise_final_error,
2883    )
2884    def getDatasetProject(self, datasetId: str) -> str:
2885        """Get parent project for a given dataset ID.
2886
2887        Args:
2888            datasetId (str): Synapse entity ID (folder or project).
2889
2890        Raises:
2891            ValueError: Raised if Synapse ID cannot be retrieved
2892            by the user or if it doesn't appear in the file view.
2893
2894        Returns:
2895            str: The Synapse ID for the parent project.
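
        Example:
            Illustrative only; the IDs are hypothetical:

            >>> store.getDatasetProject("syn12345678")
            'syn87654321'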
2896        """
2897
2898        # Subset main file view
2899        dataset_index = self.storageFileviewTable["id"] == datasetId
2900        dataset_row = self.storageFileviewTable[dataset_index]
2901
2902        # re-query if no datasets found
2903        if dataset_row.empty:
2904            sleep(5)
2905            self.query_fileview(force_requery=True)
2906            # Subset main file view
2907            dataset_index = self.storageFileviewTable["id"] == datasetId
2908            dataset_row = self.storageFileviewTable[dataset_index]
2909
2910        # Return `projectId` for given row if only one found
2911        if len(dataset_row) == 1:
2912            dataset_project = dataset_row["projectId"].values[0]
2913            return dataset_project
2914
2915        # Otherwise, check if already project itself
2916        try:
2917            syn_object = self.synapse_entity_tracker.get(
2918                synapse_id=datasetId, syn=self.syn, download_file=False
2919            )
2920            if syn_object.properties["concreteType"].endswith("Project"):
2921                return datasetId
2922        except SynapseHTTPError:
2923            raise PermissionError(
2924                f"The given dataset ({datasetId}) isn't accessible with this "
2925                "user. This might be caused by a typo in the dataset Synapse ID."
2926            )
2927
2928        # If not, then assume dataset not in file view
2929        raise LookupError(
2930            f"The given dataset ({datasetId}) doesn't appear in the "
2931            f"configured file view ({self.storageFileview}). This might "
2932            "mean that the file view's scope needs to be updated."
2933        )
2934
2935    def getDatasetAnnotationsBatch(
2936        self, datasetId: str, dataset_file_ids: Optional[Sequence[str]] = None
2937    ) -> pd.DataFrame:
2938        """Generate table for annotations across all files in given dataset.
2939        This function uses a temporary file view to generate a table
2940        instead of iteratively querying for individual entity annotations.
2941        This function is expected to run much faster than the iterative
2942        retrieval in `self.getDatasetAnnotations` on large datasets.
2943
2944        Args:
2945            datasetId (str): Synapse ID for dataset folder.
2946            dataset_file_ids (Sequence[str]): List of Synapse IDs
2947                for dataset files/folders used to subset the table.
2948
2949        Returns:
2950            pd.DataFrame: Table of annotations.
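
        Example:
            Illustrative sketch; the IDs are hypothetical:

            >>> store.getDatasetAnnotationsBatch(
            ...     "syn12345678", dataset_file_ids=["syn11111111", "syn22222222"]
            ... )  # annotations subset to the two given files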
2951        """
2952        # Create data frame from annotations file view
2953        with DatasetFileView(datasetId, self.syn) as fileview:
2954            table = fileview.query()
2955
2956        if dataset_file_ids:
2957            table = table.loc[table.index.intersection(dataset_file_ids)]
2958
2959        table = table.reset_index(drop=True)
2960
2961        return table
2962
2963    def _get_table_schema_by_cname(self, table_schema):
2964        # assume no duplicate column names in the table
2965        table_schema_by_cname = {}
2966
2967        for col_record in table_schema:
2968            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
2969            table_schema_by_cname[col_record["name"]] = col_record
2970
2971        return table_schema_by_cname
2972
2973
2974class TableOperations:
2975    """
2976    Object to hold functions for various table operations specific to the Synapse Asset Store.
2977
2978    Currently implemented operations are:
2979    createTable: upload a manifest as a new table when none exists
2980    replaceTable: replace the metadata in an existing table with metadata from another manifest
2981    updateTable: add a column to a table that already exists on synapse
2982
2983    Operations currently in development are:
2984    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
2985    """
2986
2987    def __init__(
2988        self,
2989        synStore: SynapseStorage,
2990        tableToLoad: Optional[pd.DataFrame] = None,
2991        tableName: Optional[str] = None,
2992        datasetId: Optional[str] = None,
2993        existingTableId: Optional[str] = None,
2994        restrict: bool = False,
2995        synapse_entity_tracker: Optional[SynapseEntityTracker] = None,
2996    ):
2997        """
2998        Class governing table operations (creation, replacement, upserts, updates) in schematic
2999
3000        tableToLoad: manifest formatted appropriately for the table
3001        tableName: name of the table to be uploaded
3002        datasetId: synID of the dataset for the manifest
3003        existingTableId: synId of the table currently existing on synapse (if there is one)
3004        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3005        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3006
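        Example:
            Illustrative sketch; `store` and `manifest_df` are assumed to exist,
            and the names/IDs are hypothetical:

            >>> ops = TableOperations(
            ...     synStore=store,
            ...     tableToLoad=manifest_df,
            ...     tableName="BulkRNAseq_table",
            ...     datasetId="syn12345678",
            ... )
            >>> table_id = ops.createTable(columnTypeDict={}, specifySchema=False)
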
3007        """
3008        self.synStore = synStore
3009        self.tableToLoad = tableToLoad
3010        self.tableName = tableName
3011        self.datasetId = datasetId
3012        self.existingTableId = existingTableId
3013        self.restrict = restrict
3014        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3015
3016    @tracer.start_as_current_span("TableOperations::createTable")
3017    def createTable(
3018        self,
3019        columnTypeDict: dict = None,
3020        specifySchema: bool = True,
3021    ):
3022        """
3023        Method to create a table from a metadata manifest and upload it to synapse
3024
3025        Args:
3026            columnTypeDict: dictionary schema for table columns: type, size, etc
3027            specifySchema: to specify a specific schema for the table format
3028
3029        Returns:
3030            table.schema.id: synID of the newly created table
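
        Example:
            Illustrative only; note that `columnTypeDict` is consumed as an
            iterable of Synapse column records (see `_get_table_schema_by_cname`),
            and the record below is hypothetical:

            >>> col_records = [{"name": "Sex", "columnType": "STRING", "maximumSize": 50}]
            >>> table_id = ops.createTable(columnTypeDict=col_records, specifySchema=True)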
3031        """
3032        datasetEntity = self.synapse_entity_tracker.get(
3033            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3034        )
3035        datasetName = datasetEntity.name
3036        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3037
3038        if not self.tableName:
3039            self.tableName = datasetName + "table"
3040        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3041        if specifySchema:
3042            if columnTypeDict == {}:
3043                logger.error("Did not provide a columnTypeDict.")
3044            # create list of columns:
3045            cols = []
3046            for col in self.tableToLoad.columns:
3047                if col in table_schema_by_cname:
3048                    col_type = table_schema_by_cname[col]["columnType"]
3049                    max_size = (
3050                        table_schema_by_cname[col]["maximumSize"]
3051                        if "maximumSize" in table_schema_by_cname[col].keys()
3052                        else 100
3053                    )
3054                    max_list_len = 250
3055                    if max_size and max_list_len:
3056                        cols.append(
3057                            Column(
3058                                name=col,
3059                                columnType=col_type,
3060                                maximumSize=max_size,
3061                                maximumListLength=max_list_len,
3062                            )
3063                        )
3064                    elif max_size:
3065                        cols.append(
3066                            Column(name=col, columnType=col_type, maximumSize=max_size)
3067                        )
3068                    else:
3069                        cols.append(Column(name=col, columnType=col_type))
3070                else:
3071                    # TODO add warning that the given col was not found and its max size is set to 100
3072                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3073            schema = Schema(
3074                name=self.tableName, columns=cols, parent=datasetParentProject
3075            )
3076            table = Table(schema, self.tableToLoad)
3077            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3078            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3079            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3080            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3081            return table.schema.id
3082        else:
3083            # For just uploading the tables to synapse using default
3084            # column types.
3085            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3086            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3087            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3088            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3089            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3090            return table.schema.id
3091
3092    @tracer.start_as_current_span("TableOperations::replaceTable")
3093    def replaceTable(
3094        self,
3095        specifySchema: bool = True,
3096        columnTypeDict: dict = None,
3097    ):
3098        """
3099        Method to replace an existing table on synapse with metadata from a new manifest
3100
3101        Args:
3102            specifySchema: to infer a schema for the table format
3103            columnTypeDict: dictionary schema for table columns: type, size, etc
3104
3105        Returns:
3106           existingTableId: synID of the already existing table that had its metadata replaced
3107        """
3108        datasetEntity = self.synapse_entity_tracker.get(
3109            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3110        )
3111
3112        datasetName = datasetEntity.name
3113        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3114        existing_table, existing_results = self.synStore.get_synapse_table(
3115            self.existingTableId
3116        )
3117        # remove rows
3118        self.synStore.syn.delete(existing_results)
3119        # Data changes such as removing all rows causes the eTag to change.
3120        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3121        # wait for row deletion to finish on synapse before getting empty table
3122        sleep(10)
3123
3124        # removes all current columns
3125        current_table = self.synapse_entity_tracker.get(
3126            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3127        )
3128
3129        current_columns = self.synStore.syn.getTableColumns(current_table)
3130        for col in current_columns:
3131            current_table.removeColumn(col)
3132
3133        if not self.tableName:
3134            self.tableName = datasetName + "table"
3135
3136        # Process columns according to manifest entries
3137        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3138        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3139        if specifySchema:
3140            if columnTypeDict == {}:
3141                logger.error("Did not provide a columnTypeDict.")
3142            # create list of columns:
3143            cols = []
3144
3145            for col in self.tableToLoad.columns:
3146                if col in table_schema_by_cname:
3147                    col_type = table_schema_by_cname[col]["columnType"]
3148                    max_size = (
3149                        table_schema_by_cname[col]["maximumSize"]
3150                        if "maximumSize" in table_schema_by_cname[col].keys()
3151                        else 100
3152                    )
3153                    max_list_len = 250
3154                    if max_size and max_list_len:
3155                        cols.append(
3156                            Column(
3157                                name=col,
3158                                columnType=col_type,
3159                                maximumSize=max_size,
3160                                maximumListLength=max_list_len,
3161                            )
3162                        )
3163                    elif max_size:
3164                        cols.append(
3165                            Column(name=col, columnType=col_type, maximumSize=max_size)
3166                        )
3167                    else:
3168                        cols.append(Column(name=col, columnType=col_type))
3169                else:
3170                    # TODO add warning that the given col was not found and its max size is set to 100
3171                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3172
3173            # adds new columns to schema
3174            for col in cols:
3175                current_table.addColumn(col)
3176            table_result = self.synStore.syn.store(
3177                current_table, isRestricted=self.restrict
3178            )
3179            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3180            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3181            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3182
3183            # wait for synapse store to finish
3184            sleep(1)
3185
3186            # build schema and table from columns and store with necessary restrictions
3187            schema = Schema(
3188                name=self.tableName, columns=cols, parent=datasetParentProject
3189            )
3190            schema.id = self.existingTableId
3191            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3192            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3193            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3194            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3195            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3196        else:
3197            logger.error("Must specify a schema for table replacements")
3198
3199        # remove system metadata from manifest
3200        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3201        return self.existingTableId
3202
3203    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3204    def _get_auth_token(
3205        self,
3206    ):
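        """Resolve a Synapse auth token, checking sources in priority order:
        the SYNAPSE_ACCESS_TOKEN environment variable, the request's
        Authorization header, the Synapse client's stored credentials, and
        finally the .synapseConfig file. Raises NameError if none is found.
        """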
3207        authtoken = None
3208
3209        # Get access token from environment variable if available
3210        # Primarily useful for testing environments, with other possible usefulness for containers
3211        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3212        if env_access_token:
3213            authtoken = env_access_token
3214            return authtoken
3215
3216        # Get token from authorization header
3217        # Primarily useful for API endpoint functionality
3218        if "Authorization" in self.synStore.syn.default_headers:
3219            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3220                "Bearer "
3221            )[-1]
3222            return authtoken
3223
3224        # Retrieve credentials from the synapse object
3225        # Primarily useful for local users; credentials may only be stored here when a .synapseConfig file is used, but including to be safe
3226        synapse_object_creds = self.synStore.syn.credentials
3227        if hasattr(synapse_object_creds, "_token"):
3228            authtoken = synapse_object_creds.secret
3229
3230        # Try getting creds from .synapseConfig file if it exists
3231        # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
3232        if os.path.exists(CONFIG.synapse_configuration_path):
3233            config = get_config_file(CONFIG.synapse_configuration_path)
3234
3235            # check which credentials are provided in file
3236            if config.has_option("authentication", "authtoken"):
3237                authtoken = config.get("authentication", "authtoken")
3238
3239        # raise error if required credentials are not found
3240        if not authtoken:
3241            raise NameError(
3242                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3243            )
3244
3245        return authtoken
3246
3247    @tracer.start_as_current_span("TableOperations::upsertTable")
3248    def upsertTable(self, dmge: DataModelGraphExplorer):
3249        """
3250        Method to upsert rows from a new manifest into an existing table on synapse
3251        For upsert functionality to work, primary keys must follow the naming convention of <component>_id.
3252        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3253        Currently it is required to use -dl/--use_display_label with table upserts.
3254
3255
3256        Args:
3257            dmge: DataModelGraphExplorer instance
3258
3259        Returns:
3260           existingTableId: synID of the already existing table that had its metadata replaced
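
        Example:
            Illustrative only; assumes the table was created with `-tm upsert`
            and has a primary key column such as `Biospecimen_id`:

            >>> table_id = ops.upsertTable(dmge)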
3261        """
3262
3263        authtoken = self._get_auth_token()
3264
3265        synapseDB = SynapseDatabase(
3266            auth_token=authtoken,
3267            project_id=self.synStore.getDatasetProject(self.datasetId),
3268            syn=self.synStore.syn,
3269            synapse_entity_tracker=self.synapse_entity_tracker,
3270        )
3271
3272        try:
3273            # Try performing upsert
3274            synapseDB.upsert_table_rows(
3275                table_name=self.tableName, data=self.tableToLoad
3276            )
3277        except SynapseHTTPError as ex:
3278            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3279            if "Id is not a valid column name or id" in str(ex):
3280                self._update_table_uuid_column(dmge)
3281                synapseDB.upsert_table_rows(
3282                    table_name=self.tableName, data=self.tableToLoad
3283                )
3284            # Raise if other error
3285            else:
3286                raise ex
3287
3288        return self.existingTableId
3289
3290    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3291    def _update_table_uuid_column(
3292        self,
3293        dmge: DataModelGraphExplorer,
3294    ) -> None:
3295        """Removes the `Uuid` column when present, and relpaces with an `Id` column
3296        Used to enable backwards compatability for manifests using the old `Uuid` convention
3297
3298        Args:
3299            dmge: DataModelGraphExplorer instance
3300
3301        Returns:
3302            None
3303        """
3304
3305        # Get the columns of the schema
3306        schema = self.synapse_entity_tracker.get(
3307            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3308        )
3309
3310        cols = self.synStore.syn.getTableColumns(schema)
3311
3312        # Iterate through columns until `Uuid` column is found
3313        for col in cols:
3314            if col.name.lower() == "uuid":
3315                # See if schema has `Uuid` column specified
3316                try:
3317                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3318                except KeyError:
3319                    uuid_col_in_schema = False
3320
3321                # If there is, then create a new `Id` column from scratch
3322                if uuid_col_in_schema:
3323                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3324                    schema.addColumn(new_col)
3325                    schema = self.synStore.syn.store(schema)
3326                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3327                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3328                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3329                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3330                else:
3331                    # Build ColumnModel that will be used for new column
3332                    id_column = Column(
3333                        name="Id",
3334                        columnType="STRING",
3335                        maximumSize=64,
3336                        defaultValue=None,
3337                        maximumListLength=1,
3338                    )
3339                    new_col_response = self.synStore.syn.store(id_column)
3340
3341                    # Define columnChange body
3342                    columnChangeDict = {
3343                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3344                        "entityId": self.existingTableId,
3345                        "changes": [
3346                            {
3347                                "oldColumnId": col["id"],
3348                                "newColumnId": new_col_response["id"],
3349                            }
3350                        ],
3351                    }
3352
3353                    self.synStore.syn._async_table_update(
3354                        table=self.existingTableId,
3355                        changes=[columnChangeDict],
3356                        wait=False,
3357                    )
3358                break
3359
3360        return
3361
3362    @tracer.start_as_current_span("TableOperations::updateTable")
3363    def updateTable(
3364        self,
3365        update_col: str = "Id",
3366    ):
3367        """
3368        Method to update an existing table with a new column
3369
3370        Args:
3371            update_col: column to index the old and new tables on
3372
3373        Returns:
3374           existingTableId: synID of the already existing table that had its metadata replaced
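
        Example:
            Illustrative only; merges `self.tableToLoad` into the existing
            table, matching rows on the `Id` column:

            >>> table_id = ops.updateTable(update_col="Id")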
3375        """
3376        existing_table, existing_results = self.synStore.get_synapse_table(
3377            self.existingTableId
3378        )
3379
3380        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3381        # store table with existing etag data and impose restrictions as appropriate
3382        table_result = self.synStore.syn.store(
3383            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3384            isRestricted=self.restrict,
3385        )
3386        # We cannot store the Table to the `synapse_entity_tracker` because there is
3387        # not `Schema` on the table object. The above `.store()` function call would
3388        # also update the ETag of the entity within Synapse. Remove it from the tracker
3389        # and re-retrieve it later on if needed again.
3390        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3391
3392        return self.existingTableId
3393
3394
3395class DatasetFileView:
3396    """Helper class to create temporary dataset file views.
3397    This class can be used in conjunction with a 'with' statement.
3398    This will ensure that the file view is deleted automatically.
3399    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
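
    Example:
        Illustrative only; the dataset ID is hypothetical:

        >>> with DatasetFileView("syn12345678", syn) as fileview:
        ...     table = fileview.query()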
3400    """
3401
3402    def __init__(
3403        self,
3404        datasetId: str,
3405        synapse: Synapse,
3406        name: Optional[str] = None,
3407        temporary: bool = True,
3408        parentId: Optional[str] = None,
3409    ) -> None:
3410        """Create a file view scoped to a dataset folder.
3411
3412        Args:
3413            datasetId (str): Synapse ID for a dataset folder/project.
3414            synapse (Synapse): Used for Synapse requests.
3415            name (str): Name of the file view (temporary or not).
3416            temporary (bool): Whether to delete the file view on exit
3417                of either a 'with' statement or Python entirely.
3418            parentId (str, optional): Synapse ID specifying where to
3419                store the file view. Defaults to datasetId.
3420        """
3421
3422        self.datasetId = datasetId
3423        self.synapse = synapse
3424        self.is_temporary = temporary
3425
3426        self.name = name or f"schematic annotation file view for {self.datasetId}"
3428
3429        if self.is_temporary:
3430            uid = secrets.token_urlsafe(5)
3431            self.name = f"{self.name} - UID {uid}"
3432
3433        # TODO: Allow a DCC admin to configure a "universal parent"
3434        #       Such as a Synapse project writeable by everyone.
3435        self.parentId = datasetId if parentId is None else parentId
3436
3437        # TODO: Create local sharing setting to hide from everyone else
3438        view_schema = EntityViewSchema(
3439            name=self.name,
3440            parent=self.parentId,
3441            scopes=self.datasetId,
3442            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3443            addDefaultViewColumns=False,
3444            addAnnotationColumns=True,
3445        )
3446
3447        # TODO: Handle failure due to insufficient permissions by
3448        #       creating a temporary new project to store view
3449        self.view_schema = self.synapse.store(view_schema)
3450
3451        # These are filled in after calling `self.query()`
3452        self.results = None
3453        self.table = None
3454
3455        # Ensure deletion of the file view (last resort)
3456        if self.is_temporary:
3457            atexit.register(self.delete)
3458
3459    def __enter__(self):
3460        """Return file view when entering 'with' statement."""
3461        return self
3462
3463    def __exit__(self, exc_type, exc_value, traceback):
3464        """Delete file view when exiting 'with' statement."""
3465        if self.is_temporary:
3466            self.delete()
3467
3468    def delete(self):
3469        """Delete the file view on Synapse without deleting local table."""
3470        if self.view_schema is not None:
3471            self.synapse.delete(self.view_schema)
3472            self.view_schema = None
3473
3474    def query(self, tidy=True, force=False):
3475        """Retrieve file view as a data frame (raw format sans index)."""
3476        if self.table is None or force:
3477            fileview_id = self.view_schema["id"]
3478            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3479            self.table = self.results.asDataFrame(
3480                rowIdAndVersionInIndex=False,
3481                na_values=STR_NA_VALUES_FILTERED,
3482                keep_default_na=False,
3483            )
3484        if tidy:
3485            self.tidy_table()
3486        return self.table
3487
3488    def tidy_table(self):
3489        """Convert raw file view data frame into more usable format."""
3490        assert self.table is not None, "Must call `self.query()` first."
3491        self._fix_default_columns()
3492        self._fix_list_columns()
3493        self._fix_int_columns()
3494        return self.table
3495
3496    def _fix_default_columns(self):
3497        """Rename default columns to match schematic expectations."""
3498
3499        # Drop ROW_VERSION column if present
3500        if "ROW_VERSION" in self.table:
3501            del self.table["ROW_VERSION"]
3502
3503        # Rename id column to entityId and set as data frame index
3504        if "ROW_ID" in self.table:
3505            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3506            self.table = self.table.set_index("entityId", drop=False)
3507            del self.table["ROW_ID"]
3508
3509        # Rename ROW_ETAG column to eTag and place at end of data frame
3510        if "ROW_ETAG" in self.table:
3511            row_etags = self.table.pop("ROW_ETAG")
3512
3513            # eTag column may already present if users annotated data without submitting manifest
3514            # we're only concerned with the new values and not the existing ones
3515            if "eTag" in self.table:
3516                del self.table["eTag"]
3517
3518            self.table.insert(len(self.table.columns), "eTag", row_etags)
3519
3520        return self.table
3521
3522    def _get_columns_of_type(self, types):
3523        """Helper function to get list of columns of a given type(s)."""
3524        matching_columns = []
3525        for header in self.results.headers:
3526            if header.columnType in types:
3527                matching_columns.append(header.name)
3528        return matching_columns
3529
3530    def _fix_list_columns(self):
3531        """Fix formatting of list-columns."""
3532        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3533        list_columns = self._get_columns_of_type(list_types)
3534        for col in list_columns:
            # Stringify elements so non-string lists (e.g. INTEGER_LIST) join cleanly
3535            self.table[col] = self.table[col].apply(lambda x: ", ".join(str(v) for v in x))
3536        return self.table
3537
3538    def _fix_int_columns(self):
3539        """Ensure that integer-columns are actually integers."""
3540        int_columns = self._get_columns_of_type({"INTEGER"})
3541        for col in int_columns:
3542            # Coercing to string because NaN is a floating point value
3543            # and cannot exist alongside integers in a column
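            # e.g., 1980.0 -> "1980", NaN -> ""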
3544            def to_int_fn(x):
3545                return "" if np.isnan(x) else str(int(x))
3546
3547            self.table[col] = self.table[col].apply(to_int_fn)
3548        return self.table
logger = <Logger Synapse storage (WARNING)>
tracer = <opentelemetry.sdk.trace.Tracer object>
@dataclass
class ManifestDownload:
 86@dataclass
 87class ManifestDownload(object):
 88    """
 89    syn: an object of type synapseclient.
 90    manifest_id: id of a manifest
 91    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
 92    """
 93
 94    syn: synapseclient.Synapse
 95    manifest_id: str
 96    synapse_entity_tracker: SynapseEntityTracker = field(
 97        default_factory=SynapseEntityTracker
 98    )
 99
100    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
101        """
102        Try downloading a manifest to a specific folder (temporary or not). When the
103        `use_temporary_folder` is set to True, the manifest will be downloaded to a
104        temporary folder. This is useful for when the code is running as an API server
105        where multiple requests are being made at the same time. This will prevent
106        multiple requests from overwriting the same manifest file. When the
107        `use_temporary_folder` is set to False, the manifest will be downloaded to the
108        default manifest folder.
109
110        Args:
111            use_temporary_folder: boolean argument indicating if a temporary folder
112                should be used to store the manifest file. This is useful when running
113                this code as an API server where multiple requests could be made at the
114                same time. This is set to False when the code is being used from the
115                CLI. Defaults to True.
116
117        Return:
118            manifest_data: A Synapse file entity of the downloaded manifest
119        """
120        manifest_data = self.synapse_entity_tracker.get(
121            synapse_id=self.manifest_id,
122            syn=self.syn,
123            download_file=False,
124            retrieve_if_not_present=False,
125        )
126        current_span = trace.get_current_span()
127        if (
128            manifest_data
129            and (file_handle := manifest_data.get("_file_handle", None))
130            and current_span.is_recording()
131        ):
132            current_span.set_attribute(
133                "schematic.manifest_size", file_handle.get("contentSize", 0)
134            )
135
136        if manifest_data and manifest_data.path:
137            return manifest_data
138
139        if "SECRETS_MANAGER_SECRETS" in os.environ:
140            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
141            cleanup_temporary_storage(
142                temporary_manifest_storage, time_delta_seconds=3600
143            )
144            # create a new directory to store manifest
145            if not os.path.exists(temporary_manifest_storage):
146                os.mkdir(temporary_manifest_storage)
147            # create temporary folders for storing manifests
148            download_location = create_temp_folder(
149                path=temporary_manifest_storage,
150                prefix=f"{self.manifest_id}-{time.time()}-",
151            )
152        else:
153            if use_temporary_folder:
154                download_location = create_temp_folder(
155                    path=CONFIG.manifest_folder,
156                    prefix=f"{self.manifest_id}-{time.time()}-",
157                )
158            else:
159                download_location = CONFIG.manifest_folder
160
161        manifest_data = self.synapse_entity_tracker.get(
162            synapse_id=self.manifest_id,
163            syn=self.syn,
164            download_file=True,
165            retrieve_if_not_present=True,
166            download_location=download_location,
167        )
168
169        # This is doing a rename of the downloaded file. This is important because
170        # we may be re-using a file that was previously downloaded and then renamed.
171        # The file downloaded from the Synapse client is just
172        # a direct copy of that renamed file. This code will set the name of the file
173        # to the original name that was used to download the file. Note: An MD5 checksum
174        # of the file will still be performed so if the file has changed, it will be
175        # downloaded again.
176        filename = manifest_data._file_handle.fileName
177        if filename != os.path.basename(manifest_data.path):
178            parent_folder = os.path.dirname(manifest_data.path)
179            manifest_original_name_and_path = os.path.join(parent_folder, filename)
180
181            self.syn.cache.remove(
182                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
183            )
184            os.rename(manifest_data.path, manifest_original_name_and_path)
185            manifest_data.path = manifest_original_name_and_path
186            self.syn.cache.add(
187                file_handle_id=manifest_data.dataFileHandleId,
188                path=manifest_original_name_and_path,
189                md5=manifest_data._file_handle.contentMd5,
190            )
191
192        return manifest_data
193
194    def _entity_type_checking(self) -> None:
195        """
196        check the entity type of the id that needs to be downloaded
197        Return:
198             None; an error is logged if the entity type is not a file
199        """
200        # check the type of entity
201        entity_type = entity_type_mapping(
202            syn=self.syn,
203            entity_id=self.manifest_id,
204            synapse_entity_tracker=self.synapse_entity_tracker,
205        )
206        if entity_type != "file":
207            logger.error(
208                f"You are using entity type: {entity_type}. Please provide a file ID"
209            )
210
211    def download_manifest(
212        self,
213        newManifestName: str = "",
214        manifest_df: pd.DataFrame = pd.DataFrame(),
215        use_temporary_folder: bool = True,
216    ) -> Union[str, File]:
217        """
218        Download a manifest based on a given manifest id.
219        Args:
220            newManifestName(optional): new name of a manifest that gets downloaded.
221            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
222        Return:
223            manifest_data: synapse entity file object
224        """
225
226        # enables retrying if user does not have access to uncensored manifest
227        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
228        manifest_data = ""
229
230        # check entity type
231        self._entity_type_checking()
232
233        # download a manifest
234        try:
235            manifest_data = self._download_manifest_to_folder(
236                use_temporary_folder=use_temporary_folder
237            )
238        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
239            # if there's an error getting an uncensored manifest, try getting the censored manifest
240            if not manifest_df.empty:
241                censored_regex = re.compile(".*censored.*")
242                censored = manifest_df["name"].str.contains(censored_regex)
243                new_manifest_id = manifest_df[censored]["id"].iloc[0]
244                self.manifest_id = new_manifest_id
245                try:
246                    manifest_data = self._download_manifest_to_folder(
247                        use_temporary_folder=use_temporary_folder
248                    )
249                except (
250                    SynapseUnmetAccessRestrictions,
251                    SynapseAuthenticationError,
252                ) as e:
253                    raise PermissionError(
254                        "You don't have access to censored and uncensored manifests in this dataset."
255                    ) from e
256            else:
257                logger.error(
258                    f"You don't have access to the requested resource: {self.manifest_id}"
259                )
260
261        if newManifestName and os.path.exists(manifest_data.get("path")):
262            # Rename the file we just made to the new name
263            new_manifest_filename = newManifestName + ".csv"
264
265            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
266            parent_folder = os.path.dirname(manifest_data.get("path"))
267
268            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
269
270            # Copy file to new location. The purpose of using a copy instead of a rename
271            # is to avoid any potential issues with the file being used in another
272            # process. This avoids any potential race or concurrency conditions.
273            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
274
275            # Adding this to cache will allow us to re-use the already downloaded
276            # manifest file for up to 1 hour.
277            self.syn.cache.add(
278                file_handle_id=manifest_data.dataFileHandleId,
279                path=new_manifest_path_name,
280                md5=manifest_data._file_handle.contentMd5,
281            )
282
283            # Update file names/paths in manifest_data
284            manifest_data["name"] = new_manifest_filename
285            manifest_data["filename"] = new_manifest_filename
286            manifest_data["path"] = new_manifest_path_name
287
288        return manifest_data

syn: an object of type synapseclient.
manifest_id: id of a manifest
synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

ManifestDownload(syn: synapseclient.client.Synapse, manifest_id: str, synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = <factory>)
syn: synapseclient.client.Synapse
manifest_id: str
def download_manifest(self, newManifestName: str = '', manifest_df: pandas.core.frame.DataFrame = <empty DataFrame>, use_temporary_folder: bool = True) -> Union[str, synapseclient.entity.File]:

Download a manifest based on a given manifest id.

Arguments:
  • newManifestName(optional): new name of a manifest that gets downloaded.
  • manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
Return:

manifest_data: synapse entity file object

class SynapseStorage(schematic.store.base.BaseStorage):
 291class SynapseStorage(BaseStorage):
 292    """Implementation of Storage interface for datasets/files stored on Synapse.
 293    Provides utilities to list files in a specific project, update file annotations, create fileviews, etc.
 294
 295    TODO: Need to define the interface and rename and/or refactor some of the methods below.
 296    """
 297
 298    @tracer.start_as_current_span("SynapseStorage::__init__")
 299    def __init__(
 300        self,
 301        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
 302        access_token: Optional[str] = None,
 303        project_scope: Optional[list] = None,
 304        synapse_cache_path: Optional[str] = None,
 305        perform_query: Optional[bool] = True,
 306        columns: Optional[list] = None,
 307        where_clauses: Optional[list] = None,
 308    ) -> None:
 309        """Initializes a SynapseStorage object.
 310
 311        Args:
 312            token (Optional[str], optional):
 313              Optional token parameter as found in browser cookie upon login to synapse.
 314              Defaults to None.
 315            access_token (Optional[str], optional):
 316              Optional access token (personal or oauth).
 317              Defaults to None.
 318            project_scope (Optional[list], optional): Defaults to None.
 319            synapse_cache_path (Optional[str], optional):
 320              Location of synapse cache.
 321              Defaults to None.
 322        TODO:
 323            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
 324        """
 325        self.syn = self.login(synapse_cache_path, access_token)
 326        self.project_scope = project_scope
 327        self.storageFileview = CONFIG.synapse_master_fileview_id
 328        self.manifest = CONFIG.synapse_manifest_basename
 329        self.root_synapse_cache = self.syn.cache.cache_root_dir
 330        self.synapse_entity_tracker = SynapseEntityTracker()
 331        if perform_query:
 332            self.query_fileview(columns=columns, where_clauses=where_clauses)
 333
 334    # TODO: When moving this over to a regular cron-job the following logic should be
 335    # out of `manifest_download`:
 336    # if "SECRETS_MANAGER_SECRETS" in os.environ:
 337    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 338    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
 339    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
 340    def _purge_synapse_cache(
 341        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
 342    ) -> None:
 343        """
 344        Purge the Synapse cache if it exceeds a certain size. Defaults to 1 GB.
 345        Args:
 346            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
 347              before purging cache. Default is 1 GB.
 348            minute_buffer (int): all files older than this many minutes will be deleted
 349        """
 350        # try clearing the cache
 351        # scan a directory and check size of files
 352        if os.path.exists(self.root_synapse_cache):
 353            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
 354                1024**3
 355            )
 356            nbytes = get_dir_size(self.root_synapse_cache)
 357            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
 358            # if the cache limit has been reached, purge files older than the minute buffer
 359            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
 360                num_of_deleted_files = clear_synapse_cache(
 361                    self.syn.cache, minutes=minute_buffer
 362                )
 363                logger.info(
 364                    f"{num_of_deleted_files}  files have been deleted from {self.root_synapse_cache}"
 365                )
 366            else:
 367                # on AWS, the OS takes around 14-17% of our ephemeral storage (20GiB)
 368                # instead of guessing how much space we have left, print out the size of .synapseCache here
 369                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
 370
 371    @tracer.start_as_current_span("SynapseStorage::query_fileview")
 372    def query_fileview(
 373        self,
 374        columns: Optional[list] = None,
 375        where_clauses: Optional[list] = None,
 376        force_requery: Optional[bool] = False,
 377    ) -> None:
 378        """
 379        Method to query the Synapse FileView and store the results, as a pandas DataFrame, in the storageFileviewTable attribute.
 380        It is called once during initialization of the SynapseStorage object and can be called again later to apply a more limited scope for validation purposes.
 381        Args:
 382            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 383            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 384            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
 385        """
 386        self._purge_synapse_cache()
 387
 388        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
 389        self.new_query_different = True
 390
 391        # If a query has already been performed, store the query
 392        previous_query_built = hasattr(self, "fileview_query")
 393        if previous_query_built:
 394            previous_query = self.fileview_query
 395
 396        # Build a query with the current given parameters and check to see if it is different from the previous
 397        self._build_query(columns=columns, where_clauses=where_clauses)
 398        if previous_query_built:
 399            self.new_query_different = self.fileview_query != previous_query
 400
 401        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
 402        if self.new_query_different or force_requery:
 403            try:
 404                self.storageFileviewTable = self.syn.tableQuery(
 405                    query=self.fileview_query,
 406                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
 407            except SynapseHTTPError as exc:
 408                exception_text = str(exc)
 409                if "Unknown column path" in exception_text:
 410                    raise ValueError(
 411                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation."
 412                    )
 413                elif "Unknown column" in exception_text:
 414                    missing_column = exception_text.split("Unknown column ")[-1]
 415                    raise ValueError(
 416                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
 417                    )
 418                else:
 419                    raise AccessCredentialsError(self.storageFileview)
 420
 421    @staticmethod
 422    def build_clause_from_dataset_id(
 423        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
 424    ) -> str:
 425        """
 426        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
 427        Args:
 428            dataset_id: Synapse ID of a dataset that should be used to limit the query
 429            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
 430        Returns:
 431            clause for the query or an empty string if no dataset ID is provided
 432        """
 433        # Calling this method without specifying synIDs will complete but will not scope the view
 434        if (not dataset_id) and (not dataset_folder_list):
 435            return ""
 436
 437        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
 438        if dataset_folder_list:
 439            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
 440            return f"parentId IN ({search_folders})"
 441
 442        # `dataset_id` should be provided when all files are stored directly under the dataset folder
 443        return f"parentId='{dataset_id}'"
 444
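# Illustrative sketch (not part of the class source; hypothetical Synapse IDs):
# SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
#     -> "parentId='syn123'"
# SynapseStorage.build_clause_from_dataset_id(dataset_folder_list=["syn1", "syn2"])
#     -> "parentId IN ('syn1', 'syn2')"
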
 445    def _build_query(
 446        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
 447    ):
 448        """
 449        Method to build a query for Synapse FileViews
 450        Args:
 451            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 452            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 453            self.storageFileview (str): Synapse FileView ID
 454            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
 455                Gets added to where_clauses; mostly included for backwards compatibility and as a more user-friendly way of subsetting the view.
 456        """
 457        if columns is None:
 458            columns = []
 459        if where_clauses is None:
 460            where_clauses = []
 461
 462        if self.project_scope:
 463            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
 464            where_clauses.append(project_scope_clause)
 465
 466        if where_clauses:
 467            where_clauses = " AND ".join(where_clauses)
 468            where_clauses = f"WHERE {where_clauses} ;"
 469        else:
 470            where_clauses = ";"
 471
 472        if columns:
 473            columns = ",".join(columns)
 474        else:
 475            columns = "*"
 476
 477        self.fileview_query = (
 478            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
 479        )
 480
 481        return
 482
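# Illustrative sketch (not part of the class source; hypothetical fileview ID):
# with self.storageFileview = "syn99999999" and no project scope,
#     self._build_query(columns=["id", "path"], where_clauses=["type='file'"])
# leaves self.fileview_query set to:
#     "SELECT id,path FROM syn99999999 WHERE type='file' ;"
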
 483    @staticmethod
 484    @tracer.start_as_current_span("SynapseStorage::login")
 485    def login(
 486        synapse_cache_path: Optional[str] = None,
 487        access_token: Optional[str] = None,
 488    ) -> synapseclient.Synapse:
 489        """Login to Synapse
 490
 491        Args:
 492            access_token (Optional[str], optional): A synapse access token. Defaults to None.
 493            synapse_cache_path (Optional[str]): location of synapse cache
 494
 495        Raises:
 496            ValueError: If unable to log in with the access token
 497
 498        Returns:
 499            synapseclient.Synapse: A Synapse object that is logged in
 500        """
 501        if not access_token:
 502            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
 503
 504        # login using a token
 505        if access_token:
 506            try:
 507                syn = synapseclient.Synapse(
 508                    cache_root_dir=synapse_cache_path,
 509                    debug=False,
 510                    skip_checks=True,
 511                    cache_client=False,
 512                )
 513                syn.login(authToken=access_token, silent=True)
 514            except SynapseHTTPError as exc:
 515                raise ValueError(
 516                    "No access to resources. Please make sure that your token is correct"
 517                ) from exc
 518        else:
 519            # login using synapse credentials provided by user in .synapseConfig (default) file
 520            syn = synapseclient.Synapse(
 521                configPath=CONFIG.synapse_configuration_path,
 522                cache_root_dir=synapse_cache_path,
 523                debug=False,
 524                skip_checks=True,
 525                cache_client=False,
 526            )
 527            syn.login(silent=True)
 528
 529        # set user id attribute
 530        current_span = trace.get_current_span()
 531        if current_span.is_recording():
 532            current_span.set_attribute("user.id", syn.credentials.owner_id)
 533
 534        return syn
 535
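# Illustrative sketch (not part of the class source): `login` is a staticmethod,
# so a client can be obtained without constructing a SynapseStorage object.
# Assumes SYNAPSE_ACCESS_TOKEN is set in the environment or a valid
# .synapseConfig file is available.
# syn = SynapseStorage.login(synapse_cache_path="/tmp/synapse_cache")
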
 536    def missing_entity_handler(method):
 537        def wrapper(*args, **kwargs):
 538            try:
 539                return method(*args, **kwargs)
 540            except SynapseHTTPError as ex:
 541                str_message = str(ex).replace("\n", "")
 542                if "trash" in str_message or "does not exist" in str_message:
 543                    logging.warning(str_message)
 544                    return None
 545                else:
 546                    raise ex
 547
 548        return wrapper
 549
 550    def async_missing_entity_handler(method):
 551        """Decorator to handle missing entities in async methods."""
 552
 553        async def wrapper(*args: Any, **kwargs: Any) -> Any:
 554            try:
 555                return await method(*args, **kwargs)
 556            except SynapseHTTPError as ex:
 557                str_message = str(ex).replace("\n", "")
 558                if "trash" in str_message or "does not exist" in str_message:
 559                    logging.warning(str_message)
 560                    return None
 561                else:
 562                    raise ex
 563
 564        return wrapper
 565
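# Illustrative sketch (not part of the class source): both decorators turn
# "entity in trash" / "does not exist" errors into a logged warning plus a
# None return value. `fetch_annotations` is a hypothetical method name.
# @missing_entity_handler
# def fetch_annotations(self, entity_id):
#     return self.syn.get_annotations(entity_id)
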
 566    def getStorageFileviewTable(self):
 567        """Returns the storageFileviewTable obtained during initialization."""
 568        return self.storageFileviewTable
 569
 570    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
 571        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
 572
 573        Args:
 574            currentUserId: synapse id for the user whose projects we want to get.
 575
 576        Returns:
 577            A dictionary with a next page token and the results.
 578        """
 579        all_results = self.syn.restGET(
 580            "/projects/user/{principalId}".format(principalId=currentUserId)
 581        )
 582
 583        while (
 584            "nextPageToken" in all_results
 585        ):  # iterate over next page token in results while there is any
 586            results_token = self.syn.restGET(
 587                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
 588                    principalId=currentUserId,
 589                    nextPageToken=all_results["nextPageToken"],
 590                )
 591            )
 592            all_results["results"].extend(results_token["results"])
 593
 594            if "nextPageToken" in results_token:
 595                all_results["nextPageToken"] = results_token["nextPageToken"]
 596            else:
 597                del all_results["nextPageToken"]
 598
 599        return all_results
 600
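# Illustrative sketch (not part of the class source; hypothetical user ID):
# all pages are merged into a single "results" list of project headers.
# all_projects = self.getPaginatedRestResults(currentUserId="1234567")
# project_names = [header["name"] for header in all_projects["results"]]
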
 601    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
 602    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
 603        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
 604
 605        Returns:
 606            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
 607        """
 608
 609        # get the set of all storage Synapse project accessible for this pipeline
 610        storageProjects = self.storageFileviewTable["projectId"].unique()
 611
 612        # get the set of storage Synapse project accessible for this user
 613        # get a list of projects from Synapse
 614        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
 615            current_user_id=self.syn.credentials.owner_id, syn=self.syn
 616        )
 617        project_id_to_name_dict = {}
 618        current_user_projects = []
 619        for project_header in current_user_project_headers:
 620            project_id_to_name_dict[project_header.get("id")] = project_header.get(
 621                "name"
 622            )
 623            current_user_projects.append(project_header.get("id"))
 624
 625        # find set of user projects that are also in this pipeline's storage projects set
 626        storageProjects = list(set(storageProjects) & set(current_user_projects))
 627
 628        # Limit projects to scope if specified
 629        if project_scope:
 630            storageProjects = list(set(storageProjects) & set(project_scope))
 631
 632            if not storageProjects:
 633                raise Warning(
 634                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
 635                )
 636
 637        # prepare a return list of project IDs and names
 638        projects = []
 639        for projectId in storageProjects:
 640            project_name_from_project_header = project_id_to_name_dict.get(projectId)
 641            projects.append((projectId, project_name_from_project_header))
 642
 643        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
 644
 645        return sorted_projects_list
 646
 647    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
 648    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
 649        """Gets all datasets in folder under a given storage project that the current user has access to.
 650
 651        Args:
 652            projectId: synapse ID of a storage project.
 653
 654        Returns:
 655            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
 656            None: If the projectId cannot be found on Synapse.
 657        """
 658
 659        # select all folders and fetch their names from within the storage project;
 660        # if folder content type is defined, only select folders that contain datasets
 661        if "contentType" in self.storageFileviewTable.columns:
 662            foldersTable = self.storageFileviewTable[
 663                (self.storageFileviewTable["contentType"] == "dataset")
 664                & (self.storageFileviewTable["projectId"] == projectId)
 665            ]
 666        else:
 667            foldersTable = self.storageFileviewTable[
 668                (self.storageFileviewTable["type"] == "folder")
 669                & (self.storageFileviewTable["parentId"] == projectId)
 670            ]
 671
 672        # get an array of tuples (folderId, folderName)
 673        # some folders are part of datasets; others contain datasets
 674        # each dataset parent is the project; folders part of a dataset have another folder as a parent
 675        # to get folders if and only if they contain datasets for each folder
 676        # check if folder's parent is the project; if so that folder contains a dataset,
 677        # unless the folder list has already been filtered to dataset folders based on contentType attribute above
 678
 679        datasetList = []
 680        folderProperties = ["id", "name"]
 681        for folder in list(
 682            foldersTable[folderProperties].itertuples(index=False, name=None)
 683        ):
 684            datasetList.append(folder)
 685
 686        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
 687
 688        return sorted_dataset_list
 689
 690    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
 691    def getFilesInStorageDataset(
 692        self, datasetId: str, fileNames: List = None, fullpath: bool = True
 693    ) -> List[Tuple[str, str]]:
 694        """Gets all files (excluding manifest files) in a given dataset folder.
 695
 696        Args:
 697            datasetId: synapse ID of a storage dataset.
 698            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
 699            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
 700            fullpath: if True return the full path as part of this filename; otherwise return just base filename
 701
 702        Returns:
 703            A list of files; the list consists of tuples (fileId, fileName).
 704
 705        Raises:
 706            ValueError: Dataset ID not found.
 707        """
 708        file_list = []
 709
 710        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
 711        if self.storageFileviewTable.empty:
 712            raise ValueError(
 713                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
 714            )
 715
 716        child_path = self.storageFileviewTable.loc[
 717            self.storageFileviewTable["parentId"] == datasetId, "path"
 718        ]
 719        if child_path.empty:
 720            raise LookupError(
 721                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
 722            )
 723        child_path = child_path.iloc[0]
 724
 725        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
 726        parent = child_path.split("/")[:-1]
 727        parent = "/".join(parent)
 728
 729        # Format dataset path to be used in table query
 730        dataset_path = f"'{parent}/%'"
 731
 732        # When querying, only include files to exclude entity files and subdirectories
 733        where_clauses = [f"path like {dataset_path}", "type='file'"]
 734
 735        # Requery the fileview to specifically get the files in the given dataset
 736        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
 737
 738        # Exclude manifest files
 739        non_manifest_files = self.storageFileviewTable.loc[
 740            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
 741            :,
 742        ]
 743
 744        # Remove all files that are not in the list of fileNames
 745        if fileNames:
 746            filename_regex = "|".join(fileNames)
 747
 748            matching_files = non_manifest_files["path"].str.contains(
 749                filename_regex, case=False, regex=True
 750            )
 751
 752            non_manifest_files = non_manifest_files.loc[matching_files, :]
 753
 754        # Truncate path if necessary
 755        if not fullpath:
 756            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
 757
 758        # Return list of files as expected by other methods
 759        file_list = list(non_manifest_files.itertuples(index=False, name=None))
 760
 761        return file_list
 762
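# Illustrative sketch (not part of the class source; hypothetical dataset ID):
# return base filenames only, restricted to files whose names match "sample".
# files = store.getFilesInStorageDataset(
#     "syn00000000", fileNames=["sample"], fullpath=False
# )
# -> e.g. [("syn00000001", "sample_A.txt"), ...]
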
 763    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
 764        """If both censored and uncensored manifests are present, return uncensored manifest; if only one manifest is present, return manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one.
 765        Args:
 766        manifest: a dataframe contains name and id of manifests in a given asset view
 767
 768        Return:
 769        manifest_syn_id: id of a given censored or uncensored manifest
 770        """
 771        censored_regex = re.compile(".*censored.*")
 772        censored = manifest["name"].str.contains(censored_regex)
 773        if any(censored):
 774            # Try to use uncensored manifest first
 775            not_censored = ~censored
 776            if any(not_censored):
 777                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
 778            # if only censored manifests are available, just use the first censored manifest
 779            else:
 780                manifest_syn_id = manifest["id"].iloc[0]
 781
 782        # otherwise, use the first (implied only) version that exists
 783        else:
 784            manifest_syn_id = manifest["id"].iloc[0]
 785
 786        return manifest_syn_id
 787
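# Illustrative sketch (not part of the class source; hypothetical data): when
# both censored and uncensored manifests exist, the uncensored one is chosen.
# manifest = pd.DataFrame({
#     "id": ["syn111", "syn222"],
#     "name": ["synapse_storage_manifest_censored.csv",
#              "synapse_storage_manifest.csv"],
# })
# self._get_manifest_id(manifest)  # -> "syn222"
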
 788    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
 789    def getDatasetManifest(
 790        self,
 791        datasetId: str,
 792        downloadFile: bool = False,
 793        newManifestName: str = "",
 794        use_temporary_folder: bool = True,
 795    ) -> Union[str, File]:
 796        """Gets the manifest associated with a given dataset.
 797
 798        Args:
 799            datasetId: synapse ID of a storage dataset.
 800            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
 801            newManifestName: new name of a manifest that gets downloaded
 802            use_temporary_folder: boolean argument indicating if a temporary folder
 803                should be used to store the manifest file. This is useful when running
 804                this code as an API server where multiple requests could be made at the
 805                same time. This is set to False when the code is being used from the
 806                CLI. Defaults to True.
 807
 808        Returns:
 809            manifest_syn_id (String): Synapse ID of existing manifest file.
 810            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
 811            "" (String): No pre-existing manifest in dataset.
 812        """
 813        manifest_data = ""
 814
 815        # get a list of files containing the manifest for this dataset (if any)
 816        all_files = self.storageFileviewTable
 817
 818        # construct regex based on manifest basename in the config
 819        manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv")
 820
 821        # search manifest based on given manifest basename regex above
 822        # and return a dataframe containing name and id of manifests in a given asset view
 823        manifest = all_files[
 824            (all_files["name"].str.contains(manifest_re, regex=True))
 825            & (all_files["parentId"] == datasetId)
 826        ]
 827
 828        manifest = manifest[["id", "name"]]
 829
 830        # if there is no pre-existing manifest in the specified dataset
 831        if manifest.empty:
 832            logger.warning(
 833                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
 834            )
 835            return ""
 836
 837        # if there is an existing manifest
 838        else:
 839            manifest_syn_id = self._get_manifest_id(manifest)
 840            if downloadFile:
 841                md = ManifestDownload(
 842                    self.syn,
 843                    manifest_id=manifest_syn_id,
 844                    synapse_entity_tracker=self.synapse_entity_tracker,
 845                )
 846                manifest_data = md.download_manifest(
 847                    newManifestName=newManifestName,
 848                    manifest_df=manifest,
 849                    use_temporary_folder=use_temporary_folder,
 850                )
 851                # TO DO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
 852                # then we should catch the error here without returning an empty string.
 853                if not manifest_data:
 854                    logger.debug(
 855                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
 856                    )
 857                return manifest_data
 858            return manifest_syn_id
 859
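# Illustrative sketch (not part of the class source; hypothetical dataset ID;
# assumes an initialized, logged-in store):
# store = SynapseStorage()
# manifest_syn_id = store.getDatasetManifest("syn00000000")
# manifest_file = store.getDatasetManifest("syn00000000", downloadFile=True)
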
 860    def getDataTypeFromManifest(self, manifestId: str):
 861        """Fetch a manifest and return data types of all columns
 862        Args:
 863            manifestId: synapse ID of a manifest
 864        """
 865        # get manifest file path
 866        manifest_entity = self.synapse_entity_tracker.get(
 867            synapse_id=manifestId, syn=self.syn, download_file=True
 868        )
 869        manifest_filepath = manifest_entity.path
 870
 871        # load manifest dataframe
 872        manifest = load_df(
 873            manifest_filepath,
 874            preserve_raw_input=False,
 875            data_model=False,
 876        )
 877
 878        # convert the dataFrame to use best possible dtypes.
 879        manifest_new = manifest.convert_dtypes()
 880
 881        # get data types of columns
 882        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
 883
 884        # return the result as a dictionary
 885        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
 886
 887        return result_dict
 888
 889    def _get_files_metadata_from_dataset(
 890        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
 891    ) -> Optional[dict]:
 892        """retrieve file ids under a particular datasetId
 893
 894        Args:
 895            datasetId (str): a dataset id
 896            only_new_files (bool): if True, only include files that do not already exist in the manifest
 897            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.
 898
 899        Returns:
 900            a dictionary of filenames and entity IDs under the given datasetId, or None if no files are found under the given dataset id
 901        """
 902        dataset_files = self.getFilesInStorageDataset(datasetId)
 903        if dataset_files:
 904            dataset_file_names_id_dict = self._get_file_entityIds(
 905                dataset_files, only_new_files=only_new_files, manifest=manifest
 906            )
 907            return dataset_file_names_id_dict
 908        else:
 909            return None
 910
 911    def add_entity_id_and_filename(
 912        self, datasetId: str, manifest: pd.DataFrame
 913    ) -> pd.DataFrame:
 914        """add entityid and filename column to an existing manifest assuming entityId column is not already present
 915
 916        Args:
 917            datasetId (str): dataset syn id
 918            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and its Filename column is present but completely empty
 919
 920        Returns:
 921            pd.DataFrame: returns a pandas dataframe
 922        """
 923        # get file names and entity ids of a given dataset
 924        dataset_files_dict = self._get_files_metadata_from_dataset(
 925            datasetId, only_new_files=False
 926        )
 927
 928        if dataset_files_dict:
 929            # turn manifest dataframe back to a dictionary for operation
 930            manifest_dict = manifest.to_dict("list")
 931
 932            # update Filename column
 933            # add entityId column to the end
 934            manifest_dict.update(dataset_files_dict)
 935
 936            # if the component column exists in existing manifest, fill up that column
 937            if "Component" in manifest_dict.keys():
 938                manifest_dict["Component"] = manifest_dict["Component"] * max(
 939                    1, len(manifest_dict["Filename"])
 940                )
 941
 942            # turn dictionary back to a dataframe
 943            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
 944            manifest_df_updated = manifest_df_index.transpose()
 945
 946            # fill na with empty string
 947            manifest_df_updated = manifest_df_updated.fillna("")
 948
 949            # drop index
 950            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
 951
 952            return manifest_df_updated
 953        else:
 954            return manifest
 955
 956    def fill_in_entity_id_filename(
 957        self, datasetId: str, manifest: pd.DataFrame
 958    ) -> Tuple[List, pd.DataFrame]:
 959        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 960
 961        Args:
 962            datasetId (str): dataset syn id
 963            manifest (pd.DataFrame): existing manifest dataframe.
 964
 965        Returns:
 966            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 967        """
 968        # get dataset file names and entity id as a list of tuple
 969        dataset_files = self.getFilesInStorageDataset(datasetId)
 970
 971        # update manifest with additional filenames, if any
 972        # note that if there is an existing manifest and there are files in the dataset
 973        # the columns Filename and entityId are assumed to be present in manifest schema
 974        # TODO: use idiomatic pandas syntax
 975        if not dataset_files:
 976            manifest = manifest.fillna("")
 977            return dataset_files, manifest
 978
 979        all_files = self._get_file_entityIds(
 980            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 981        )
 982        new_files = self._get_file_entityIds(
 983            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 984        )
 985
 986        all_files = pd.DataFrame(all_files)
 987        new_files = pd.DataFrame(new_files)
 988
 989        # update manifest so that it contains new dataset files
 990        manifest = (
 991            pd.concat([manifest, new_files], sort=False)
 992            .reset_index()
 993            .drop("index", axis=1)
 994        )
 995
 996        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 997        manifest_reindex = manifest.set_index("entityId")
 998        all_files_reindex = all_files.set_index("entityId")
 999        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1000            manifest_reindex
1001        )
1002
1003        # Check if individual file paths in manifest and from synapse match
1004        file_paths_match = (
1005            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1006        )
1007
1008        # If all the paths do not match, update the manifest with the filepaths from synapse
1009        if not file_paths_match.all():
1010            manifest_reindex.loc[
1011                ~file_paths_match, "Filename"
1012            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1013
1014            # reformat manifest for further use
1015            manifest = manifest_reindex.reset_index()
1016            entityIdCol = manifest.pop("entityId")
1017            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1018
1019        manifest = manifest.fillna("")
1020        return dataset_files, manifest
1021
1022    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1023    def updateDatasetManifestFiles(
1024        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1025    ) -> Union[Tuple[str, pd.DataFrame], None]:
1026        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1027
1028        Args:
1029            dmge: DataModelGraphExplorer Instance
1030            datasetId: synapse ID of a storage dataset.
1031            store: if set to True store updated manifest in asset store; if set to False
1032            return a Pandas dataframe containing updated manifest but do not store to asset store
1033
1034
1035        Returns:
1036            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1037            If there is no existing manifest or if the manifest does not have an entityId column, return None
1038        """
1039
1040        # get existing manifest Synapse ID
1041        manifest_id = self.getDatasetManifest(datasetId)
1042
1043        # if there is no manifest return None
1044        if not manifest_id:
1045            return None
1046
1047        manifest_entity = self.synapse_entity_tracker.get(
1048            synapse_id=manifest_id, syn=self.syn, download_file=True
1049        )
1050        manifest_filepath = manifest_entity.path
1051        manifest = load_df(manifest_filepath)
1052
1053        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1054        if "entityId" not in manifest.columns:
1055            return None
1056
1057        manifest_is_file_based = "Filename" in manifest.columns
1058
1059        if manifest_is_file_based:
1060            # update manifest with additional filenames, if any
1061            # note that if there is an existing manifest and there are files in the dataset
1062            # the columns Filename and entityId are assumed to be present in manifest schema
1063            # TODO: use idiomatic pandas syntax
1064            dataset_files, manifest = self.fill_in_entity_id_filename(
1065                datasetId, manifest
1066            )
1067            if dataset_files:
1068                # update the manifest file, so that it contains the relevant entity IDs
1069                if store:
1070                    manifest.to_csv(manifest_filepath, index=False)
1071
1072                    # store manifest and update associated metadata with manifest on Synapse
1073                    manifest_id = self.associateMetadataWithFiles(
1074                        dmge, manifest_filepath, datasetId
1075                    )
1076
1077        return manifest_id, manifest
1078
1079    def _get_file_entityIds(
1080        self,
1081        dataset_files: List,
1082        only_new_files: bool = False,
1083        manifest: pd.DataFrame = None,
1084    ):
1085        """
1086        Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
1087
1088        Args:
1089            manifest: metadata manifest
1090            dataset_file: List of all files in a dataset
1091            only_new_files: boolean to control whether only new files are returned or all files in the dataset
1092        Returns:
1093            files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
1094        """
1095        files = {"Filename": [], "entityId": []}
1096
1097        if only_new_files:
1098            if manifest is None:
1099                raise UnboundLocalError(
1100                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1101                )
1102
1103            if "entityId" not in manifest.columns:
1104                raise ValueError(
1105                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1106                    "Please generate an empty manifest without annotations, manually add annotations to the "
1107                    "appropriate files in the manifest, and then try again."
1108                )
1109
1110            # find new files (that are not in the current manifest) if any
1111            for file_id, file_name in dataset_files:
1112                if file_id not in manifest["entityId"].values:
1113                    files["Filename"].append(file_name)
1114                    files["entityId"].append(file_id)
1115        else:
1116            # get all files
1117            for file_id, file_name in dataset_files:
1118                files["Filename"].append(file_name)
1119                files["entityId"].append(file_id)
1120
1121        return files
1122
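# Illustrative sketch (not part of the class source; hypothetical data): with
# only_new_files=True, files already present in the manifest are skipped.
# dataset_files = [("syn1", "a.txt"), ("syn2", "b.txt")]
# manifest = pd.DataFrame({"entityId": ["syn1"]})
# self._get_file_entityIds(dataset_files, only_new_files=True, manifest=manifest)
# -> {"Filename": ["b.txt"], "entityId": ["syn2"]}
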
1123    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1124    def getProjectManifests(
1125        self, projectId: str
1126    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1127        """Gets all metadata manifest files across all datasets in a specified project.
1128
1129        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1130                 as a list of tuples, one for each manifest:
1131                    [
1132                        (
1133                            (datasetId, dataName),
1134                            (manifestId, manifestName),
1135                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1136                        ),
1137                        ...
1138                    ]
1139
1140        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1141        """
1142        component = None
1143        entity = None
1144        manifests = []
1145
1146        datasets = self.getStorageDatasetsInProject(projectId)
1147
1148        for datasetId, datasetName in datasets:
1149            # encode information about the manifest in a simple list (so that R clients can unpack it)
1150            # eventually can serialize differently
1151
1152            # Get synID of manifest for a dataset
1153            manifestId = self.getDatasetManifest(datasetId)
1154
1155            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1156            if manifestId:
1157                annotations = self.getFileAnnotations(manifestId)
1158
1159                # If manifest has annotations specifying component, use that
1160                if annotations and "Component" in annotations:
1161                    component = annotations["Component"]
1162                    entity = self.synapse_entity_tracker.get(
1163                        synapse_id=manifestId, syn=self.syn, download_file=False
1164                    )
1165                    manifest_name = entity["properties"]["name"]
1166
1167                # otherwise download the manifest and parse for information
1168                elif not annotations or "Component" not in annotations:
1169                    logging.debug(
1170                        f"No component annotations have been found for manifest {manifestId}. "
1171                        "The manifest will be downloaded and parsed instead. "
1172                        "For increased speed, add component annotations to manifest."
1173                    )
1174
1175                    manifest_info = self.getDatasetManifest(
1176                        datasetId, downloadFile=True
1177                    )
1178                    manifest_name = manifest_info["properties"].get("name", "")
1179
1180                    if not manifest_name:
1181                        logger.error(f"Failed to download manifests from {datasetId}")
1182
1183                    manifest_path = manifest_info["path"]
1184
1185                    manifest_df = load_df(manifest_path)
1186
1187                    # Get component from component column if it exists
1188                    if (
1189                        "Component" in manifest_df
1190                        and not manifest_df["Component"].empty
1191                    ):
1192                        component = list(set(manifest_df["Component"]))
1194
1195                        # Added to address issues raised during DCA testing
1196                        if "" in component:
1197                            component.remove("")
1198
1199                        if len(component) == 1:
1200                            component = component[0]
1201                        elif len(component) > 1:
1202                            logger.warning(
1203                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1204                                "Behavior of manifests with multiple components is undefined."
1205                            )
1206            else:
1207                manifest_name = ""
1208                component = None
1209            if component:
1210                manifest = (
1211                    (datasetId, datasetName),
1212                    (manifestId, manifest_name),
1213                    (component, component),
1214                )
1215            elif manifestId:
1216                logger.debug(
1217                    f"Manifest {manifestId} does not have an associated Component"
1218                )
1219                manifest = (
1220                    (datasetId, datasetName),
1221                    (manifestId, manifest_name),
1222                    ("", ""),
1223                )
1224            else:
1225                manifest = (
1226                    (datasetId, datasetName),
1227                    ("", ""),
1228                    ("", ""),
1229                )
1230
1231            if manifest:
1232                manifests.append(manifest)
1233
1234        return manifests
1235
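        # Illustrative use of getProjectManifests above (hypothetical project ID;
        # assumes an authenticated SynapseStorage instance named `store`):
        #
        #     for dataset, manifest, component in store.getProjectManifests("syn12345678"):
        #         print(dataset, manifest, component)
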
1236    def upload_project_manifests_to_synapse(
1237        self, dmge: DataModelGraphExplorer, projectId: str
1238    ) -> List[str]:
1239        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1240
1241        Returns: List of the dataset names for all the manifests that were loaded as tables.
1242        """
1243
1244        manifests = []
1245        manifest_loaded = []
1246        datasets = self.getStorageDatasetsInProject(projectId)
1247
1248        for datasetId, datasetName in datasets:
1249            # encode information about the manifest in a simple list (so that R clients can unpack it)
1250            # eventually can serialize differently
1251
1252            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1253
1254            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1255            if manifest_info:
1256                manifest_id = manifest_info["properties"]["id"]
1257                manifest_name = manifest_info["properties"]["name"]
1258                manifest_path = manifest_info["path"]
1259                manifest_df = load_df(manifest_path)
1260                manifest_table_id = self.uploadDB(
1261                    dmge=dmge,
1262                    manifest=manifest_df,
1263                    datasetId=datasetId,
1264                    table_name=datasetName,
1265                )
1266                manifest_loaded.append(datasetName)
1267        return manifest_loaded
1268
1269    def upload_annotated_project_manifests_to_synapse(
1270        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1271    ) -> Tuple[List[tuple], List[tuple]]:
1272        """
1273        Purpose:
1274            For all manifests in a project, upload them as tables and add annotations to the manifest csv.
1275            Assumes the manifest is already present as a CSV in a dataset in the project.
1276
1277        """
        # Imported here because DataModelParser and DataModelGraph are not
        # imported at module scope
        from schematic.schemas.data_model_graph import DataModelGraph
        from schematic.schemas.data_model_parser import DataModelParser

1278        # Instantiate DataModelParser
1279        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1280        # Parse Model
1281        parsed_data_model = data_model_parser.parse_model()
1282
1283        # Instantiate DataModelGraph
1284        data_model_grapher = DataModelGraph(parsed_data_model)
1285
1286        # Generate graph
1287        graph_data_model = data_model_grapher.generate_data_model_graph()
1288
1289        # Instantiate DataModelGraphExplorer
1290        dmge = DataModelGraphExplorer(graph_data_model)
1291
1292        manifests = []
1293        manifest_loaded = []
1294        datasets = self.getStorageDatasetsInProject(projectId)
1295        for datasetId, datasetName in datasets:
1296            # encode information about the manifest in a simple list (so that R clients can unpack it)
1297            # eventually can serialize differently
1298
1299            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1300            manifests.append(manifest)
1301
1302            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1303
1304            if manifest_info:
1305                manifest_id = manifest_info["properties"]["id"]
1306                manifest_name = manifest_info["properties"]["name"]
1307                manifest_path = manifest_info["path"]
1308                manifest = (
1309                    (datasetId, datasetName),
1310                    (manifest_id, manifest_name),
1311                    ("", ""),
1312                )
1313                if not dry_run:
1314                    self.associateMetadataWithFiles(
1315                        dmge, manifest_path, datasetId, manifest_record_type="table"
1316                    )
1317                manifest_loaded.append(manifest)
1318
1319        return manifests, manifest_loaded
1320
1321    def move_entities_to_new_project(
1322        self,
1323        projectId: str,
1324        newProjectId: str,
1325        returnEntities: bool = False,
1326        dry_run: bool = False,
1327    ):
1328        """
1329        For each manifest csv in a project, look up all the entity ids that are associated with it.
1330        Look up each entity in the files and move it to the new project.
1331        """
1332
1333        manifests = []
1334        manifest_loaded = []
1335        datasets = self.getStorageDatasetsInProject(projectId)
1336        if datasets:
1337            for datasetId, datasetName in datasets:
1338                # encode information about the manifest in a simple list (so that R clients can unpack it)
1339                # eventually can serialize differently
1340
1341                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1342                manifests.append(manifest)
1343
1344                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1345                if manifest_info:
1346                    manifest_id = manifest_info["properties"]["id"]
1347                    manifest_name = manifest_info["properties"]["name"]
1348                    manifest_path = manifest_info["path"]
1349                    manifest_df = load_df(manifest_path)
1350
1351                    manifest = (
1352                        (datasetId, datasetName),
1353                        (manifest_id, manifest_name),
1354                        ("", ""),
1355                    )
1356                    manifest_loaded.append(manifest)
1357
1358                    annotation_entities = self.storageFileviewTable[
1359                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1360                        & (self.storageFileviewTable["type"] == "folder")
1361                    ]["id"]
1362
1363                    if returnEntities:
1364                        for entityId in annotation_entities:
1365                            if not dry_run:
1366                                moved_entity = self.syn.move(entityId, datasetId)
1367                                self.synapse_entity_tracker.add(
1368                                    synapse_id=moved_entity.id, entity=moved_entity
1369                                )
1370                            else:
1371                                logger.info(
1372                                    f"{entityId} will be moved to folder {datasetId}."
1373                                )
1374                    else:
1375                        # generate project folder
1376                        archive_project_folder = Folder(
1377                            projectId + "_archive", parent=newProjectId
1378                        )
1379                        archive_project_folder = self.syn.store(archive_project_folder)
1380                        self.synapse_entity_tracker.add(
1381                            synapse_id=archive_project_folder.id,
1382                            entity=archive_project_folder,
1383                        )
1384
1385                        # generate dataset folder
1386                        dataset_archive_folder = Folder(
1387                            "_".join([datasetId, datasetName, "archive"]),
1388                            parent=archive_project_folder.id,
1389                        )
1390                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1391                        self.synapse_entity_tracker.add(
1392                            synapse_id=dataset_archive_folder.id,
1393                            entity=dataset_archive_folder,
1394                        )
1395
1396                        for entityId in annotation_entities:
1397                            # move entities to folder
1398                            if not dry_run:
1399                                moved_entity = self.syn.move(
1400                                    entityId, dataset_archive_folder.id
1401                                )
1402                                self.synapse_entity_tracker.add(
1403                                    synapse_id=moved_entity.id, entity=moved_entity
1404                                )
1405                            else:
1406                                logger.info(
1407                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1408                                )
1409        else:
1410            raise LookupError(
1411                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1412            )
1413        return manifests, manifest_loaded
1414
1415    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1416    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1417        """Download a Synapse table as a pd.DataFrame; also return the query results, which carry the table schema and etags.
1418
1419        Args:
1420            synapse_id: synapse ID of the table to query
1421        """
1422
1423        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1424        df = results.asDataFrame(
1425            rowIdAndVersionInIndex=False,
1426            na_values=STR_NA_VALUES_FILTERED,
1427            keep_default_na=False,
1428        )
1429
1430        return df, results
1431
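        # Illustrative use of get_synapse_table above (hypothetical table ID;
        # assumes an authenticated SynapseStorage instance named `store`):
        #
        #     df, results = store.get_synapse_table("syn98765432")
        #     print(df.head())
        #     print(results.tableId)
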
1432    @missing_entity_handler
1433    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1434    def uploadDB(
1435        self,
1436        dmge: DataModelGraphExplorer,
1437        manifest: pd.DataFrame,
1438        datasetId: str,
1439        table_name: str,
1440        restrict: bool = False,
1441        table_manipulation: str = "replace",
1442        table_column_names: str = "class_label",
1443    ):
1444        """
1445        Method to upload a database to an asset store. In synapse, this will upload a metadata table
1446
1447        Args:
1448            dmge: DataModelGraphExplorer object
1449            manifest: pd.Df manifest to upload
1450            datasetId: synID of the dataset for the manifest
1451            table_name: name of the table to be uploaded
1452            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1454            table_manipulation: str, 'replace' or 'upsert'; in the case where a manifest table already exists, should the new metadata replace the existing table (replace) or be added to it (upsert)
1455            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
1456                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters (including spaces) to retain
1457                display label formatting.
1458        Returns:
1459            manifest_table_id: synID of the uploaded table
1460            manifest: the original manifest
1461            table_manifest: manifest formatted appropriately for the table
1461            table_manifest: manifest formatted appropriately for the table
1462
1463        """
1464
1465        col_schema, table_manifest = self.formatDB(
1466            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1467        )
1468
1469        manifest_table_id = self.buildDB(
1470            datasetId,
1471            table_name,
1472            col_schema,
1473            table_manifest,
1474            table_manipulation,
1475            dmge,
1476            restrict,
1477        )
1478
1479        return manifest_table_id, manifest, table_manifest
1480
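        # Illustrative call to uploadDB above (hypothetical identifiers; `dmge`
        # is a DataModelGraphExplorer and `manifest_df` a loaded manifest
        # DataFrame):
        #
        #     table_id, manifest_df, table_manifest = store.uploadDB(
        #         dmge=dmge,
        #         manifest=manifest_df,
        #         datasetId="syn11111111",
        #         table_name="biospecimen_synapse_storage_manifest_table",
        #         table_manipulation="replace",
        #     )
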
1481    @tracer.start_as_current_span("SynapseStorage::formatDB")
1482    def formatDB(self, dmge, manifest, table_column_names):
1483        """
1484        Method to format a manifest appropriately for upload as a table
1485
1486        Args:
1487            dmge: DataModelGraphExplorer object
1488            manifest: pd.Df manifest to upload
1489            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
1490                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters (including spaces) to retain
1491                display label formatting.
1492        Returns:
1493            col_schema: schema for table columns: type, size, etc
1494            table_manifest: formatted manifest
1495
1496        """
1497        # Rename the manifest columns to display names to match fileview
1498
1499        blacklist_chars = ["(", ")", ".", " ", "-"]
1500        manifest_columns = manifest.columns.tolist()
1501
1502        table_manifest = deepcopy(manifest)
1503
1504        if table_column_names == "display_name":
1505            cols = table_manifest.columns
1506
1507        elif table_column_names == "display_label":
1508            cols = [
1509                str(col).translate({ord(x): "" for x in blacklist_chars})
1510                for col in manifest_columns
1511            ]
1512
1513        elif table_column_names == "class_label":
1514            cols = [
1515                get_class_label_from_display_name(str(col)).translate(
1516                    {ord(x): "" for x in blacklist_chars}
1517                )
1518                for col in manifest_columns
1519            ]
1520        else:
1521            raise ValueError(
1522                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1523            )
1524
1525        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1526
1527        # Reset column names in table manifest
1528        table_manifest.columns = cols
1529
1530        # move entity id to end of df
1531        entity_col = table_manifest.pop("entityId")
1532        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1533
1534        # Get the column schema
1535        col_schema = as_table_columns(table_manifest)
1536
1537        # Set Id column length to 64 (not auto-set by as_table_columns)
1538        for i, col in enumerate(col_schema):
1539            if col["name"].lower() == "id":
1540                col_schema[i]["maximumSize"] = 64
1541
1542        return col_schema, table_manifest
1543
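        # Example of the three table_column_names styles in formatDB above for a
        # display name such as "Family History" (illustrative; exact output
        # depends on get_class_label_from_display_name):
        #
        #     display_name  -> "Family History"  (used verbatim)
        #     class_label   -> "FamilyHistory"   (upper camelcase, blacklisted chars stripped)
        #     display_label -> "FamilyHistory"   (blacklisted chars, including spaces, stripped)
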
1544    @tracer.start_as_current_span("SynapseStorage::buildDB")
1545    def buildDB(
1546        self,
1547        datasetId: str,
1548        table_name: str,
1549        col_schema: List,
1550        table_manifest: pd.DataFrame,
1551        table_manipulation: str,
1552        dmge: DataModelGraphExplorer,
1553        restrict: bool = False,
1554    ):
1555        """
1556        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1557        Calls TableOperations class to execute
1558
1559        Args:
1560            datasetId: synID of the dataset for the manifest
1561            table_name: name of the table to be uploaded
1562            col_schema: schema for table columns: type, size, etc from `formatDB`
1563            table_manifest: formatted manifest that can be uploaded as a table
1564            table_manipulation: str, 'replace', 'upsert', or 'update'; in the case where a manifest table already exists, should the new metadata replace the existing table (replace), be added to it (upsert), or update it in place (update)
            dmge: DataModelGraphExplorer object, passed through to upsert operations
1565            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1566
1567        Returns:
1568            manifest_table_id: synID of the uploaded table
1569
1570        """
1571        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1572        existing_table_id = self.syn.findEntityId(
1573            name=table_name, parent=table_parent_id
1574        )
1575
1576        tableOps = TableOperations(
1577            synStore=self,
1578            tableToLoad=table_manifest,
1579            tableName=table_name,
1580            datasetId=datasetId,
1581            existingTableId=existing_table_id,
1582            restrict=restrict,
1583            synapse_entity_tracker=self.synapse_entity_tracker,
1584        )
1585
1586        if not table_manipulation or existing_table_id is None:
1587            manifest_table_id = tableOps.createTable(
1588                columnTypeDict=col_schema,
1589                specifySchema=True,
1590            )
1591        elif existing_table_id is not None:
1592            if table_manipulation.lower() == "replace":
1593                manifest_table_id = tableOps.replaceTable(
1594                    specifySchema=True,
1595                    columnTypeDict=col_schema,
1596                )
1597            elif table_manipulation.lower() == "upsert":
1598                manifest_table_id = tableOps.upsertTable(
1599                    dmge=dmge,
1600                )
1601            elif table_manipulation.lower() == "update":
1602                manifest_table_id = tableOps.updateTable()
            else:
                raise ValueError(
                    f"The provided table_manipulation: {table_manipulation} is not valid; expected 'replace', 'upsert', or 'update'."
                )
1603
1604        if table_manipulation and table_manipulation.lower() == "upsert":
1605            table_entity = self.synapse_entity_tracker.get(
1606                synapse_id=existing_table_id or manifest_table_id,
1607                syn=self.syn,
1608                download_file=False,
1609            )
1610            annos = OldAnnotations(
1611                id=table_entity.id,
1612                etag=table_entity.etag,
1613                values=table_entity.annotations,
1614            )
1615            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1616            annos = self.syn.set_annotations(annos)
1617            table_entity.etag = annos.etag
1618            table_entity.annotations = annos
1619
1620        return manifest_table_id
1621
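        # How buildDB above dispatches on table_manipulation when a table with
        # the same name already exists (see TableOperations):
        #
        #     "replace" -> tableOps.replaceTable(...)  # rebuild the table with the new manifest
        #     "upsert"  -> tableOps.upsertTable(...)   # merge new rows into the existing table
        #     "update"  -> tableOps.updateTable()      # update existing rows in place
        #
        # With no existing table (or no table_manipulation), a new table is created.
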
1622    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1623    def upload_manifest_file(
1624        self,
1625        manifest,
1626        metadataManifestPath,
1627        datasetId,
1628        restrict_manifest,
1629        component_name="",
1630    ):
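            """Upload the manifest csv to Synapse, temporarily renaming the local
            file to the component-specific manifest file name for the upload.

            Args:
                manifest (pd.DataFrame): manifest to upload; written back to metadataManifestPath first
                metadataManifestPath (str): path to the manifest csv on disk
                datasetId (str): synapse ID of the dataset folder the manifest belongs to
                restrict_manifest (bool): whether the manifest contains sensitive data that needs access restrictions
                component_name (str): name of the manifest component, used in the stored file name

            Returns:
                manifest_synapse_file_id (str): synapse ID of the stored manifest file
            """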
1631        # Update manifest to have the new entityId column
1632        manifest.to_csv(metadataManifestPath, index=False)
1633
1634        # store manifest to Synapse as a CSV
1635        # update file name
1636        file_name_full = metadataManifestPath.split("/")[-1]
1637        file_extension = file_name_full.split(".")[-1]
1638
1639        # Differentiate "censored" and "uncensored" manifest
1640        if "censored" in file_name_full:
1641            file_name_new = (
1642                os.path.basename(CONFIG.synapse_manifest_basename)
1643                + "_"
1644                + component_name
1645                + "_censored"
1646                + "."
1647                + file_extension
1648            )
1649        else:
1650            file_name_new = (
1651                os.path.basename(CONFIG.synapse_manifest_basename)
1652                + "_"
1653                + component_name
1654                + "."
1655                + file_extension
1656            )
1657
1658        manifest_synapse_file = None
1659        try:
1660            # Rename the file to file_name_new, then revert:
1661            # this maintains the original file name in case other code
1662            # expects the file to exist under the original name
1663            original_file_path = metadataManifestPath
1664            new_file_path = os.path.join(
1665                os.path.dirname(metadataManifestPath), file_name_new
1666            )
1667            os.rename(original_file_path, new_file_path)
1668
1669            manifest_synapse_file = self._store_file_for_manifest_upload(
1670                new_file_path=new_file_path,
1671                dataset_id=datasetId,
1672                existing_file_name=file_name_full,
1673                file_name_new=file_name_new,
1674                restrict_manifest=restrict_manifest,
1675            )
1676            manifest_synapse_file_id = manifest_synapse_file.id
1677
1678        finally:
1679            # Revert the file name back to the original
1680            os.rename(new_file_path, original_file_path)
1681
1682            if manifest_synapse_file:
1683                manifest_synapse_file.path = original_file_path
1684
1685        return manifest_synapse_file_id
1686
1687    def _store_file_for_manifest_upload(
1688        self,
1689        new_file_path: str,
1690        dataset_id: str,
1691        existing_file_name: str,
1692        file_name_new: str,
1693        restrict_manifest: bool,
1694    ) -> File:
1695        """Handles a create or update of a manifest file that is going to be uploaded.
1696        If we already have a copy of the Entity in memory we will update that instance,
1697        otherwise create a new File instance to be created in Synapse. Once stored
1698        this will add the file to the `synapse_entity_tracker` for future reference.
1699
1700        Args:
1701            new_file_path (str): The path to the new manifest file
1702            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
1703            existing_file_name (str): The name of the existing file
1704            file_name_new (str): The name of the new file
1705            restrict_manifest (bool): Whether the manifest should be restricted
1706
1707        Returns:
1708            File: The stored manifest file
1709        """
1710        local_tracked_file_instance = (
1711            self.synapse_entity_tracker.search_local_by_parent_and_name(
1712                name=existing_file_name, parent_id=dataset_id
1713            )
1714            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1715                name=file_name_new, parent_id=dataset_id
1716            )
1717        )
1718
1719        if local_tracked_file_instance:
1720            local_tracked_file_instance.path = new_file_path
1721            local_tracked_file_instance.description = (
1722                "Manifest for dataset " + dataset_id
1723            )
1724            manifest_synapse_file = local_tracked_file_instance
1725        else:
1726            manifest_synapse_file = File(
1727                path=new_file_path,
1728                description="Manifest for dataset " + dataset_id,
1729                parent=dataset_id,
1730                name=file_name_new,
1731            )
1732
1733        manifest_synapse_file = self.syn.store(
1734            manifest_synapse_file, isRestricted=restrict_manifest
1735        )
1736
1737        self.synapse_entity_tracker.add(
1738            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1739        )
1740        return manifest_synapse_file
1741
1742    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1743        """get annotations asynchronously
1744
1745        Args:
1746            synapse_id (str): synapse id of the entity that the annotation belongs to
1747
1748        Returns:
1749            Dict[str, Any]: The requested entity bundle matching
1750            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1751        """
1752        return await get_entity_id_bundle2(
1753            entity_id=synapse_id,
1754            request={"includeAnnotations": True},
1755            synapse_client=self.syn,
1756        )
1757
1758    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1759        """Store annotations asynchronously
1760
1761        Args:
1762            annotation_dict (dict): annotation in a dictionary format
1763
1764        Returns:
1765            Annotations: The stored annotations.
1766        """
1767        annotation_data = Annotations.from_dict(
1768            synapse_annotations=annotation_dict["annotations"]["annotations"]
1769        )
1770        annotation_class = Annotations(
1771            annotations=annotation_data,
1772            etag=annotation_dict["annotations"]["etag"],
1773            id=annotation_dict["annotations"]["id"],
1774        )
1775        annotation_storage_result = await annotation_class.store_async(
1776            synapse_client=self.syn
1777        )
1778        local_entity = self.synapse_entity_tracker.get(
1779            synapse_id=annotation_dict["annotations"]["id"],
1780            syn=self.syn,
1781            download_file=False,
1782            retrieve_if_not_present=False,
1783        )
1784        if local_entity:
1785            local_entity.etag = annotation_storage_result.etag
1786            local_entity.annotations = annotation_storage_result
1787        return annotation_storage_result
1788
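        # Illustrative round-trip with the two async annotation helpers above,
        # from within a coroutine (hypothetical entity ID; assumes an
        # authenticated SynapseStorage instance named `store`):
        #
        #     annos = await store.get_async_annotation("syn22222222")
        #     # ... mutate annos["annotations"]["annotations"] as needed ...
        #     stored = await store.store_async_annotation(annotation_dict=annos)
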
1789    def process_row_annotations(
1790        self,
1791        dmge: DataModelGraphExplorer,
1792        metadata_syn: Dict[str, Any],
1793        hide_blanks: bool,
1794        csv_list_regex: str,
1795        annos: Dict[str, Any],
1796        annotation_keys: str,
1797    ) -> Dict[str, Any]:
1798        """Processes metadata annotations based on the logic below:
1799        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1800            An empty or whitespace-only string.
1801            A NaN value (if the annotation is a float).
1802        If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
1803        If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1804
1805        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1806        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1807
1808        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1809
1810        4. Returns the updated annotations dictionary.
1811
1812        Args:
1813            dmge (DataModelGraphExplorer): data model graph explorer
1814            metadata_syn (dict): metadata used for Synapse storage
1815            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1816            csv_list_regex (str): Regex to match with comma separated list
1817            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1818            annotation_keys (str): display_label/class_label
1819
1820        Returns:
1821            Dict[str, Any]: annotations as a dictionary
1822
1823        ```mermaid
1824        flowchart TD
1825            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1826            C -- Yes --> D{Is hide_blanks True?}
1827            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1828            D -- No --> F[Assign empty string to annotation key]
1829            C -- No --> G{Is anno_v a string?}
1830            G -- No --> H[Assign original value of anno_v to annotation key]
1831            G -- Yes --> I{Does anno_v match csv_list_regex?}
1832            I -- Yes --> J[Get validation rule of anno_k]
1833            J --> K{Does the validation rule contain 'list'}
1834            K -- Yes --> L[Split anno_v by commas and assign as list]
1835            I -- No --> H
1836            K -- No --> H
1837        ```
1838        """
1839        for anno_k, anno_v in metadata_syn.items():
1840            # Remove keys whose values are NaN, empty strings, or whitespace-only
1841            # strings from the dict of annotations to be uploaded
1842            if hide_blanks and (
1843                (isinstance(anno_v, str) and anno_v.strip() == "")
1844                or (isinstance(anno_v, float) and np.isnan(anno_v))
1845            ):
1846                annos["annotations"]["annotations"].pop(anno_k, None)
1849                continue
1850
1851            # Otherwise save annotation as appropriate
1852            if isinstance(anno_v, float) and np.isnan(anno_v):
1853                annos["annotations"]["annotations"][anno_k] = ""
1854                continue
1855
1856            # Handle strings that match the csv_list_regex and pass the validation rule
1857            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1858                # Use a dictionary to dynamically choose the argument
1859                param = (
1860                    {"node_display_name": anno_k}
1861                    if annotation_keys == "display_label"
1862                    else {"node_label": anno_k}
1863                )
1864                node_validation_rules = dmge.get_node_validation_rules(**param)
1865
1866                if rule_in_rule_list("list", node_validation_rules):
1867                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1868                    continue
1869            # default: assign the original value
1870            annos["annotations"]["annotations"][anno_k] = anno_v
1871
1872        return annos
1873
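        # Example of the list-splitting step in process_row_annotations above: a
        # value such as "liver, blood" that matches comma_separated_list_regex,
        # whose key carries a "list" validation rule, is stored as
        # ["liver", " blood"] (the split is on "," only, so surrounding
        # whitespace is preserved).
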
1874    @async_missing_entity_handler
1875    async def format_row_annotations(
1876        self,
1877        dmge: DataModelGraphExplorer,
1878        row: pd.Series,
1879        entityId: str,
1880        hideBlanks: bool,
1881        annotation_keys: str,
1882    ) -> Union[None, Dict[str, Any]]:
1883        """Format row annotations
1884
1885        Args:
1886            dmge (DataModelGraphExplorer): data model graph explorer object
1887            row (pd.Series): row of the manifest
1888            entityId (str): entity id of the manifest
1889            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, upload Annotation keys with empty string values
1890            annotation_keys (str): display_label/class_label
1891
1892        Returns:
1893            Union[None, Dict[str, Any]]: if the entity id is in the trash can, return None. Otherwise, return the annotations
1894        """
1895        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support, e.g. no spaces or parentheses)
1896        # note: the removal of special characters will apply only to annotation keys; we are not altering the manifest.
1897        # this could create a divergence between manifest columns and annotations. this should be ok for most use cases.
1898        # columns with special characters are outside of the schema
1899        metadataSyn = {}
1900        blacklist_chars = ["(", ")", ".", " ", "-"]
1901
1902        for k, v in row.to_dict().items():
1903            if annotation_keys == "display_label":
1904                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1905            elif annotation_keys == "class_label":
1906                keySyn = get_class_label_from_display_name(str(k)).translate(
1907                    {ord(x): "" for x in blacklist_chars}
1908                )
1909
1910            # Skip `Filename` and `ETag` columns when setting annotations
1911            if keySyn in ["Filename", "ETag", "eTag"]:
1912                continue
1913
1914            # truncate annotation values to 500 characters if the
1915            # size of values is greater than equal to 500 characters
1916            # add an explicit [truncatedByDataCuratorApp] message at the end
1917            # of every truncated message to indicate that the cell value
1918            # has been truncated
1919            if isinstance(v, str) and len(v) >= 500:
1920                v = v[0:472] + "[truncatedByDataCuratorApp]"
1921
1922            metadataSyn[keySyn] = v
1923
1924        # This will first check if the entity is already in memory, and if so, that
1925        # instance is used. Unfortunately, the expected return format needs to match
1926        # the Synapse API, so we need to convert the annotations to the expected format.
1927        entity = self.synapse_entity_tracker.get(
1928            synapse_id=entityId,
1929            syn=self.syn,
1930            download_file=False,
1931            retrieve_if_not_present=False,
1932        )
1933        if entity is not None:
1934            synapse_annotations = _convert_to_annotations_list(
1935                annotations=entity.annotations
1936            )
1937            annos = {
1938                "annotations": {
1939                    "id": entity.id,
1940                    "etag": entity.etag,
1941                    "annotations": synapse_annotations,
1942                }
1943            }
1944        else:
1945            annos = await self.get_async_annotation(entityId)
1946
1947        # set annotation(s) for the various objects/items in a dataset on Synapse
1948        csv_list_regex = comma_separated_list_regex()
1949
1950        annos = self.process_row_annotations(
1951            dmge=dmge,
1952            metadata_syn=metadataSyn,
1953            hide_blanks=hideBlanks,
1954            csv_list_regex=csv_list_regex,
1955            annos=annos,
1956            annotation_keys=annotation_keys,
1957        )
1958
1959        return annos
1960
1961    @missing_entity_handler
1962    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
1963    def format_manifest_annotations(self, manifest, manifest_synapse_id):
1964        """
1965        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
1966        For now just getting the Component.
1967        """
1968
1969        entity = self.synapse_entity_tracker.get(
1970            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
1971        )
1972        is_file = entity.concreteType.endswith(".FileEntity")
1973        is_table = entity.concreteType.endswith(".TableEntity")
1974
1975        if is_file:
1976            # Get file metadata
1977            metadata = self.getFileAnnotations(manifest_synapse_id)
1978
1979            # If there is a defined component add it to the metadata.
1980            if "Component" in manifest.columns:
1981                # Gather component information
1982                component = manifest["Component"].unique()
1983
1984                # Double check that only a single component is listed, else raise an error.
1985                if len(component) != 1:
1986                    raise ValueError(
1987                        "Manifest has more than one component. Please check manifest and resubmit."
1988                    )
1991
1992                # Add component to metadata
1993                metadata["Component"] = component[0]
1994
1995        elif is_table:
1996            # Get table metadata
1997            metadata = self.getTableAnnotations(manifest_synapse_id)
1998
1999        # Get annotations
2000        annos = OldAnnotations(
2001            id=entity.id, etag=entity.etag, values=entity.annotations
2002        )
2003
2004        # Add metadata to the annotations
2005        for annos_k, annos_v in metadata.items():
2006            annos[annos_k] = annos_v
2007
2008        return annos
2009
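        # Illustrative use of format_manifest_annotations above (hypothetical
        # manifest Synapse ID; `manifest` is a loaded manifest DataFrame):
        #
        #     annos = store.format_manifest_annotations(manifest, "syn33333333")
        #     store.syn.set_annotations(annos)
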
2010    '''
2011    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
2012        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
2013        """
2014        Purpose:
2015            Works very similarly to associateMetadataWithFiles except takes in the manifest
2016            rather than the manifest path
2017
2018        """
2019
2020        # Add uuid for table updates and fill.
2021        if not "Uuid" in manifest.columns:
2022            manifest["Uuid"] = ''
2023
2024        for idx,row in manifest.iterrows():
2025            if not row["Uuid"]:
2026                gen_uuid = uuid.uuid4()
2027                row["Uuid"] = gen_uuid
2028                manifest.loc[idx, 'Uuid'] = gen_uuid
2029
2030        # add entityId as a column if not already there or
2031        # fill any blanks with an empty string.
2032        if not "entityId" in manifest.columns:
2033            manifest["entityId"] = ""
2034        else:
2035            manifest["entityId"].fillna("", inplace=True)
2036
2037        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
2038        dmge = DataModelGraphExplorer()
2039
2040        # Create table name here.
2041        if 'Component' in manifest.columns:
2042            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
2043        else:
2044            table_name = 'synapse_storage_manifest_table'
2045
2046        # Upload manifest as a table and get the SynID and manifest
2047        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
2048                                                    dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
2049
2050        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
2051        # also set metadata for each synapse entity as Synapse annotations
2052        for idx, row in manifest.iterrows():
2053            if not row["entityId"]:
2054                # If not using entityIds, fill with manifest_table_id so
2055                row["entityId"] = manifest_synapse_table_id
2056                entityId = ''
2057            else:
2058                # get the entity id corresponding to this row
2059                entityId = row["entityId"]
2060
2061        # Load manifest to synapse as a CSV File
2062        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
2063
2064        # Get annotations for the file manifest.
2065        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
2066
2067        self.syn.set_annotations(manifest_annotations)
2068
2069        logger.info("Associated manifest file with dataset on Synapse.")
2070
2071        # Update manifest Synapse table with new entity id column.
2072        self.make_synapse_table(
2073            table_to_load = table_manifest,
2074            dataset_id = datasetId,
2075            existingTableId = manifest_synapse_table_id,
2076            table_name = table_name,
2077            update_col = 'Uuid',
2078            specify_schema = False,
2079            )
2080
2081        # Get annotations for the table manifest
2082        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
2083        self.syn.set_annotations(manifest_annotations)
2084        return manifest_synapse_table_id
2085    '''
2086
2087    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
2088        """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing.
2089        Args:
2090            metadataManifestPath (str): path where manifest is stored
2091        Returns:
2092            manifest(pd.DataFrame): Manifest loaded as a pandas dataframe
2093        Raises:
2094            FileNotFoundError: Manifest file does not exist at provided path.
2095        """
2096        # read new manifest csv
2097        try:
2098            load_args = {
2099                "dtype": "string",
2100            }
2101            manifest = load_df(
2102                metadataManifestPath,
2103                preserve_raw_input=False,
2104                allow_na_values=False,
2105                **load_args,
2106            )
2107        except FileNotFoundError as err:
2108            raise FileNotFoundError(
2109                f"No manifest file was found at this path: {metadataManifestPath}"
2110            ) from err
2111        return manifest
2112
2113    def _add_id_columns_to_manifest(
2114        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
2115    ):
2116        """Helper function to add Id and entityId columns to the manifest if they do not already exist; fill Id values per row.
2117        Args:
2118            Manifest loaded as a pd.Dataframe
2119        Returns (pd.DataFrame):
2120            Manifest df with new Id and EntityId columns (and UUID values) if they were not already present.
2121        """
2122
2123        # Add Id for table updates and fill.
2124        if not col_in_dataframe("Id", manifest):
2125            # See if schema has `Uuid` column specified
2126            try:
2127                uuid_col_in_schema = dmge.is_class_in_schema(
2128                    "Uuid"
2129                ) or dmge.is_class_in_schema("uuid")
2130            except KeyError:
2131                uuid_col_in_schema = False
2132
2133            # Rename `Uuid` column if it wasn't specified in the schema
2134            if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema:
2135                manifest.rename(columns={"Uuid": "Id"}, inplace=True)
2136            # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column
2137            else:
2138                manifest["Id"] = ""
2139
2140        # Retrieve the ID column name (id, Id and ID) are treated the same.
2141        id_col_name = [col for col in manifest.columns if col.lower() == "id"][0]
2142
2143        # Check if values have been added to the Id column; if not, add a UUID so the value in the row is not blank.
2144        for idx, row in manifest.iterrows():
2145            if not row[id_col_name]:
2146                gen_uuid = str(uuid.uuid4())
2147                row[id_col_name] = gen_uuid
2148                manifest.loc[idx, id_col_name] = gen_uuid
2149
2150        # add entityId as a column if not already there or
2151        # fill any blanks with an empty string.
2152        if not col_in_dataframe("entityId", manifest):
2153            manifest["entityId"] = ""
2154        else:
2155            manifest["entityId"].fillna("", inplace=True)
2156
2157        return manifest
2158
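        # Illustrative effect of _add_id_columns_to_manifest above (hypothetical
        # manifest): a manifest with only a "Filename" column gains an "Id"
        # column filled with fresh UUID4 strings and an empty "entityId" column.
        # An existing "Uuid" column is renamed to "Id" when "Uuid" is not part
        # of the schema.
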
2159    def _generate_table_name(self, manifest):
2160        """Helper function to generate a table name for upload to synapse.
2161
2162        Args:
2163            Manifest loaded as a pd.Dataframe
2164
2165        Returns:
2166            table_name (str): Name of the table to load
2167            component_name (str): Name of the manifest component (if applicable)
2168        """
2169        # Create table name here.
2170        if "Component" in manifest.columns:
2171            component_name = manifest["Component"][0].lower()
2172            table_name = component_name + "_synapse_storage_manifest_table"
2173        else:
2174            component_name = ""
2175            table_name = "synapse_storage_manifest_table"
2176        return table_name, component_name
2177
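        # Example for _generate_table_name above: a manifest whose first
        # "Component" value is "Biospecimen" yields table_name
        # "biospecimen_synapse_storage_manifest_table" and component_name
        # "biospecimen"; without a "Component" column the defaults are
        # "synapse_storage_manifest_table" and "".
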
2178    def _create_entity_id(self, idx, row, manifest, datasetId):
2179        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
2180        Args:
            idx: index of the current row of the manifest being processed
2181            row: current row of manifest being processed
2182            manifest (pd.DataFrame): loaded df containing user supplied data.
2183            datasetId (str): synapse ID of folder containing the dataset
2184
2185        Returns:
2186            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
2187            entityId (str): Generated Entity Id.
2188
2189        """
2190        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
2191        rowEntity = self.syn.store(rowEntity)
2192        entityId = rowEntity["id"]
2193        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
2194        row["entityId"] = entityId
2195        manifest.loc[idx, "entityId"] = entityId
2196        return manifest, entityId
2197
2198    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
2199        """Process annotations and store them on synapse asynchronously
2200
2201        Args:
2202            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step
2203
2204        Raises:
2205            RuntimeError: raise a run time error if a task failed to complete
2206        """
2207        while requests:
2208            done_tasks, pending_tasks = await asyncio.wait(
2209                requests, return_when=asyncio.FIRST_COMPLETED
2210            )
2211            requests = pending_tasks
2212
2213            for completed_task in done_tasks:
2214                try:
2215                    annos = completed_task.result()
2216
2217                    if isinstance(annos, Annotations):
2218                        logger.info(f"Successfully stored annotations for {annos.id}")
2219                    else:
2220                        # store annotations if they are not None
2221                        if annos:
2222                            entity_id = annos["annotations"]["id"]
2223                            logger.info(
2224                                f"Obtained and processed annotations for {entity_id} entity"
2225                            )
2226                            requests.add(
2227                                asyncio.create_task(
2228                                    self.store_async_annotation(annotation_dict=annos)
2229                                )
2230                            )
2231                except Exception as e:
2232                    raise RuntimeError(f"failed with {repr(e)}.") from e
2233
2234    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2235    async def add_annotations_to_entities_files(
2236        self,
2237        dmge,
2238        manifest,
2239        manifest_record_type: str,
2240        datasetId: str,
2241        hideBlanks: bool,
2242        manifest_synapse_table_id="",
2243        annotation_keys: str = "class_label",
2244    ):
2245        """
2246        Depending on upload type, add Ids to the entityId row. Add annotations to connected
2247        files and folders. Despite the name of this function, it also applies to folders.
2248
2249        Args:
2250            dmge: DataModelGraphExplorer Object
2251            manifest (pd.DataFrame): loaded df containing user supplied data.
2252            manifest_record_type: valid values are 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2253            datasetId (str): synapse ID of folder containing the dataset
2254            hideBlanks (bool): Default is False. Boolean flag; does not upload annotation keys with blank values when True, uploads annotation keys with empty string values when False.
2255            manifest_synapse_table_id (str): Default is an empty string ''.
2256            annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
2257                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters (including spaces) to retain
2258                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2259        Returns:
2260            manifest (pd.DataFrame): modified to add entityId as appropriate
2261
2262        """
2263
2264        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2265        if "filename" in [col.lower() for col in manifest.columns]:
2266            # get current list of files and store as dataframe
2267            dataset_files = self.getFilesInStorageDataset(datasetId)
2268            files_and_entityIds = self._get_file_entityIds(
2269                dataset_files=dataset_files, only_new_files=False
2270            )
2271            file_df = pd.DataFrame(files_and_entityIds)
2272
2273            # Merge dataframes to add entityIds
2274            manifest = manifest.merge(
2275                file_df, how="left", on="Filename", suffixes=["_x", None]
2276            ).drop("entityId_x", axis=1)
2277
2278        # Fill `entityId` for each row if missing and annotate entity as appropriate
2279        requests = set()
2280        for idx, row in manifest.iterrows():
2281            if not row["entityId"] and (
2282                manifest_record_type == "file_and_entities"
2283                or manifest_record_type == "table_file_and_entities"
2284            ):
2285                manifest, entityId = self._create_entity_id(
2286                    idx, row, manifest, datasetId
2287                )
2288            elif not row["entityId"] and manifest_record_type == "table_and_file":
2289                # If not using entityIds, fill with manifest_table_id so
2290                row["entityId"] = manifest_synapse_table_id
2291                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2292                entityId = ""
2293            # If the row is the manifest table, do not add annotations
2294            elif row["entityId"] == manifest_synapse_table_id:
2295                entityId = ""
2296            else:
2297                # get the file id of the file to annotate, collected in above step.
2298                entityId = row["entityId"]
2299
2300            # Adding annotations to connected files.
2301            if entityId:
2302                # Format annotations for Synapse
2303                annos_task = asyncio.create_task(
2304                    self.format_row_annotations(
2305                        dmge, row, entityId, hideBlanks, annotation_keys
2306                    )
2307                )
2308                requests.add(annos_task)
2309        await self._process_store_annos(requests)
2310        return manifest
2311
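        # Summary of how manifest_record_type drives entityId filling in
        # add_annotations_to_entities_files above:
        #
        #     "file_and_entities" / "table_file_and_entities" -> create a Folder
        #         per row and store its Synapse ID in `entityId`
        #     "table_and_file" -> fill blank `entityId` cells with the manifest
        #         table's Synapse ID (no per-row annotations are uploaded)
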
2312    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2313    def upload_manifest_as_table(
2314        self,
2315        dmge: DataModelGraphExplorer,
2316        manifest: pd.DataFrame,
2317        metadataManifestPath: str,
2318        datasetId: str,
2319        table_name: str,
2320        component_name: str,
2321        restrict: bool,
2322        manifest_record_type: str,
2323        hideBlanks: bool,
2324        table_manipulation: str,
2325        table_column_names: str,
2326        annotation_keys: str,
2327        file_annotations_upload: bool = True,
2328    ):
2329        """Upload manifest to Synapse as a table and csv.
2330        Args:
2331            dmge: DataModelGraphExplorer object
2332            manifest (pd.DataFrame): loaded df containing user supplied data.
2333            metadataManifestPath: path to csv containing a validated metadata manifest.
2334            datasetId (str): synapse ID of folder containing the dataset
2335            table_name (str): Generated to name the table being uploaded.
2336            component_name (str): Name of the component manifest that is currently being uploaded.
2337            restrict (bool): Flag for censored data.
2338            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2339            hideBlanks (bool): Default is False. Boolean flag; does not upload annotation keys with blank values when True, uploads annotation keys with empty string values when False.
2340            table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2341            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
2342                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters (including spaces) to retain
2343                display label formatting.
2344            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2345                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters (including spaces) to retain
2346                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2347            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2348        Return:
2349            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2350        """
2351        # Upload manifest as a table, get the ID and updated manifest.
2352        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2353            dmge=dmge,
2354            manifest=manifest,
2355            datasetId=datasetId,
2356            table_name=table_name,
2357            restrict=restrict,
2358            table_manipulation=table_manipulation,
2359            table_column_names=table_column_names,
2360        )
2361
2362        if file_annotations_upload:
2363            manifest = asyncio.run(
2364                self.add_annotations_to_entities_files(
2365                    dmge,
2366                    manifest,
2367                    manifest_record_type,
2368                    datasetId,
2369                    hideBlanks,
2370                    manifest_synapse_table_id,
2371                    annotation_keys,
2372                )
2373            )
2374        # Load manifest to synapse as a CSV File
2375        manifest_synapse_file_id = self.upload_manifest_file(
2376            manifest=manifest,
2377            metadataManifestPath=metadataManifestPath,
2378            datasetId=datasetId,
2379            restrict_manifest=restrict,
2380            component_name=component_name,
2381        )
2382
2383        # Set annotations for the file manifest.
2384        manifest_annotations = self.format_manifest_annotations(
2385            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2386        )
2387        annos = self.syn.set_annotations(annotations=manifest_annotations)
2388        manifest_entity = self.synapse_entity_tracker.get(
2389            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2390        )
2391        manifest_entity.annotations = annos
2392        manifest_entity.etag = annos.etag
2393
2394        logger.info("Associated manifest file with dataset on Synapse.")
2395
2396        # Update manifest Synapse table with new entity id column.
2397        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2398            dmge=dmge,
2399            manifest=manifest,
2400            datasetId=datasetId,
2401            table_name=table_name,
2402            restrict=restrict,
2403            table_manipulation="update",
2404            table_column_names=table_column_names,
2405        )
2406
2407        # Set annotations for the table manifest
2408        manifest_annotations = self.format_manifest_annotations(
2409            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2410        )
2411        annotations_manifest_table = self.syn.set_annotations(
2412            annotations=manifest_annotations
2413        )
2414        manifest_table_entity = self.synapse_entity_tracker.get(
2415            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2416        )
2417        manifest_table_entity.annotations = annotations_manifest_table
2418        manifest_table_entity.etag = annotations_manifest_table.etag
2419
2420        return manifest_synapse_file_id
2421
2422    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2423    def upload_manifest_as_csv(
2424        self,
2425        dmge,
2426        manifest,
2427        metadataManifestPath,
2428        datasetId,
2429        restrict,
2430        manifest_record_type,
2431        hideBlanks,
2432        component_name,
2433        annotation_keys: str,
2434        file_annotations_upload: bool = True,
2435    ):
2436        """Upload manifest to Synapse as a csv only.
2437        Args:
2438            dmge: DataModelGraphExplorer object
2439            manifest (pd.DataFrame): loaded df containing user supplied data.
2440            metadataManifestPath: path to csv containing a validated metadata manifest.
2441            datasetId (str): synapse ID of folder containing the dataset
2442            restrict (bool): Flag for censored data.
2443            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2444            hideBlanks (bool): Default is False. If True, annotation keys with blank values are not uploaded; if False, annotation keys with empty string values are uploaded.
2445            annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
2446                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2447                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2448            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2449        Returns:
2450            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2451        """
2452        if file_annotations_upload:
2453            manifest = asyncio.run(
2454                self.add_annotations_to_entities_files(
2455                    dmge,
2456                    manifest,
2457                    manifest_record_type,
2458                    datasetId,
2459                    hideBlanks,
2460                    annotation_keys=annotation_keys,
2461                )
2462            )
2463
2464        # Load manifest to synapse as a CSV File
2465        manifest_synapse_file_id = self.upload_manifest_file(
2466            manifest,
2467            metadataManifestPath,
2468            datasetId,
2469            restrict,
2470            component_name=component_name,
2471        )
2472
2473        # Set annotations for the file manifest.
2474        manifest_annotations = self.format_manifest_annotations(
2475            manifest, manifest_synapse_file_id
2476        )
2477        annos = self.syn.set_annotations(manifest_annotations)
2478        manifest_entity = self.synapse_entity_tracker.get(
2479            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2480        )
2481        manifest_entity.annotations = annos
2482        manifest_entity.etag = annos.etag
2483
2484        logger.info("Associated manifest file with dataset on Synapse.")
2485
2486        return manifest_synapse_file_id
2487
2488    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2489    def upload_manifest_combo(
2490        self,
2491        dmge,
2492        manifest,
2493        metadataManifestPath,
2494        datasetId,
2495        table_name,
2496        component_name,
2497        restrict,
2498        manifest_record_type,
2499        hideBlanks,
2500        table_manipulation,
2501        table_column_names: str,
2502        annotation_keys: str,
2503        file_annotations_upload: bool = True,
2504    ):
2505        """Upload manifest to Synapse as a table and CSV with entities.
2506        Args:
2507            dmge: DataModelGraphExplorer object
2508            manifest (pd.DataFrame): loaded df containing user supplied data.
2509            metadataManifestPath: path to csv containing a validated metadata manifest.
2510            datasetId (str): synapse ID of folder containing the dataset
2511            table_name (str): Generated to name the table being uploaded.
2512            component_name (str): Name of the component manifest that is currently being uploaded.
2513            restrict (bool): Flag for censored data.
2514            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2515            hideBlanks (bool): Default is False. If True, annotation keys with blank values are not uploaded; if False, annotation keys with empty string values are uploaded.
2516            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2517            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
2518                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2519                display label formatting.
2520            annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
2521                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2522                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2523            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2524        Returns:
2525            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2526        """
2527        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2528            dmge=dmge,
2529            manifest=manifest,
2530            datasetId=datasetId,
2531            table_name=table_name,
2532            restrict=restrict,
2533            table_manipulation=table_manipulation,
2534            table_column_names=table_column_names,
2535        )
2536
2537        if file_annotations_upload:
2538            manifest = asyncio.run(
2539                self.add_annotations_to_entities_files(
2540                    dmge,
2541                    manifest,
2542                    manifest_record_type,
2543                    datasetId,
2544                    hideBlanks,
2545                    manifest_synapse_table_id,
2546                    annotation_keys=annotation_keys,
2547                )
2548            )
2549
2550        # Load manifest to synapse as a CSV File
2551        manifest_synapse_file_id = self.upload_manifest_file(
2552            manifest, metadataManifestPath, datasetId, restrict, component_name
2553        )
2554
2555        # Set annotations for the file manifest.
2556        manifest_annotations = self.format_manifest_annotations(
2557            manifest, manifest_synapse_file_id
2558        )
2559        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2560        manifest_entity = self.synapse_entity_tracker.get(
2561            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2562        )
2563        manifest_entity.annotations = file_manifest_annotations
2564        manifest_entity.etag = file_manifest_annotations.etag
2565        logger.info("Associated manifest file with dataset on Synapse.")
2566
2567        # Update manifest Synapse table with new entity id column.
2568        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2569            dmge=dmge,
2570            manifest=manifest,
2571            datasetId=datasetId,
2572            table_name=table_name,
2573            restrict=restrict,
2574            table_manipulation="update",
2575            table_column_names=table_column_names,
2576        )
2577
2578        # Set annotations for the table manifest
2579        manifest_annotations = self.format_manifest_annotations(
2580            manifest, manifest_synapse_table_id
2581        )
2582        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2583        manifest_entity = self.synapse_entity_tracker.get(
2584            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2585        )
2586        manifest_entity.annotations = table_manifest_annotations
2587        manifest_entity.etag = table_manifest_annotations.etag
2588        return manifest_synapse_file_id
2589
2590    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2591    def associateMetadataWithFiles(
2592        self,
2593        dmge: DataModelGraphExplorer,
2594        metadataManifestPath: str,
2595        datasetId: str,
2596        manifest_record_type: str = "table_file_and_entities",
2597        hideBlanks: bool = False,
2598        restrict_manifest=False,
2599        table_manipulation: str = "replace",
2600        table_column_names: str = "class_label",
2601        annotation_keys: str = "class_label",
2602        file_annotations_upload: bool = True,
2603    ) -> str:
2604        """Associate metadata with files in a storage dataset already on Synapse.
2605        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2606
2607        If this is a new manifest, there may be no Synapse entities associated with its rows;
2608        this may be because the data type (e.g. clinical data) is tabular
2609        and does not require files; to utilize uniform interfaces downstream
2610        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2611        and an entity column is added to the manifest containing the resulting
2612        entity IDs; a table is also created at present as an additional interface
2613        for downstream query and interaction with the data.
2614
2615        Args:
2616            dmge: DataModelGraphExplorer Object
2617            metadataManifestPath: path to csv containing a validated metadata manifest.
2618            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2619            Some datasets, e.g. clinical data, do not contain file IDs, but data is stored in a table: one row per item.
2620            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
2621            datasetId: synapse ID of folder containing the dataset
2622            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and table options in combination.
2623            hideBlanks: Default is False. If True, annotation keys with blank values are not uploaded; if False, annotation keys with empty string values are uploaded.
2624            restrict_manifest (bool): Default is False. Flag for censored data.
2625            table_manipulation (str): Default is 'replace'. Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2626            table_column_names: (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name will use the raw display name as the column name; class_label will format the display
2627                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2628                display label formatting.
2629            annotation_keys: (str) display_label/class_label (default). Sets the labeling style for annotation keys. class_label will format the display
2630                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2631                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2632        Returns:
2633            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2634        """
2635        # Read new manifest CSV:
2636        manifest = self._read_manifest(metadataManifestPath)
2637        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2638
2639        table_name, component_name = self._generate_table_name(manifest)
2640
2641        # Upload manifest to synapse based on user input (manifest_record_type)
2642        if manifest_record_type == "file_only":
2643            manifest_synapse_file_id = self.upload_manifest_as_csv(
2644                dmge=dmge,
2645                manifest=manifest,
2646                metadataManifestPath=metadataManifestPath,
2647                datasetId=datasetId,
2648                restrict=restrict_manifest,
2649                hideBlanks=hideBlanks,
2650                manifest_record_type=manifest_record_type,
2651                component_name=component_name,
2652                annotation_keys=annotation_keys,
2653                file_annotations_upload=file_annotations_upload,
2654            )
2655        elif manifest_record_type == "table_and_file":
2656            manifest_synapse_file_id = self.upload_manifest_as_table(
2657                dmge=dmge,
2658                manifest=manifest,
2659                metadataManifestPath=metadataManifestPath,
2660                datasetId=datasetId,
2661                table_name=table_name,
2662                component_name=component_name,
2663                restrict=restrict_manifest,
2664                hideBlanks=hideBlanks,
2665                manifest_record_type=manifest_record_type,
2666                table_manipulation=table_manipulation,
2667                table_column_names=table_column_names,
2668                annotation_keys=annotation_keys,
2669                file_annotations_upload=file_annotations_upload,
2670            )
2671        elif manifest_record_type == "file_and_entities":
2672            manifest_synapse_file_id = self.upload_manifest_as_csv(
2673                dmge=dmge,
2674                manifest=manifest,
2675                metadataManifestPath=metadataManifestPath,
2676                datasetId=datasetId,
2677                restrict=restrict_manifest,
2678                hideBlanks=hideBlanks,
2679                manifest_record_type=manifest_record_type,
2680                component_name=component_name,
2681                annotation_keys=annotation_keys,
2682                file_annotations_upload=file_annotations_upload,
2683            )
2684        elif manifest_record_type == "table_file_and_entities":
2685            manifest_synapse_file_id = self.upload_manifest_combo(
2686                dmge=dmge,
2687                manifest=manifest,
2688                metadataManifestPath=metadataManifestPath,
2689                datasetId=datasetId,
2690                table_name=table_name,
2691                component_name=component_name,
2692                restrict=restrict_manifest,
2693                hideBlanks=hideBlanks,
2694                manifest_record_type=manifest_record_type,
2695                table_manipulation=table_manipulation,
2696                table_column_names=table_column_names,
2697                annotation_keys=annotation_keys,
2698                file_annotations_upload=file_annotations_upload,
2699            )
2700        else:
2701            raise ValueError(f"Invalid manifest_record_type: '{manifest_record_type}'. Valid values are 'file_only', 'file_and_entities', 'table_and_file', 'table_file_and_entities'.")
2702        return manifest_synapse_file_id
2703
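# Illustrative usage (not part of the class): associate a validated manifest
# with a dataset. The Synapse ID, manifest path, and `dmge` object below are
# hypothetical placeholders.
#
#     store = SynapseStorage()
#     manifest_file_id = store.associateMetadataWithFiles(
#         dmge=dmge,  # DataModelGraphExplorer built from the project's data model
#         metadataManifestPath="data/synapse_storage_manifest.csv",
#         datasetId="syn12345678",
#         manifest_record_type="table_and_file",
#     )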
2704    def getTableAnnotations(self, table_id: str):
2705        """Generate dictionary of annotations for the given Synapse table.
2706        Synapse returns all custom annotations as lists since they
2707        can contain multiple values. In all cases, the values will
2708        be converted into strings and concatenated with ", ".
2709
2710        Args:
2711            table_id (str): Synapse ID for the table.
2712
2713        Returns:
2714            dict: Annotations as comma-separated strings.
2715        """
2716        try:
2717            entity = self.synapse_entity_tracker.get(
2718                synapse_id=table_id, syn=self.syn, download_file=False
2719            )
2720            is_table = entity.concreteType.endswith(".TableEntity")
2721            annotations_raw = entity.annotations
2722        except SynapseHTTPError:
2723            # If an error occurs with retrieving entity, skip it
2724            # This could be caused by a temporary file view that
2725            # was deleted since its ID was retrieved
2726            is_table = False
2727
2728        # Skip anything that isn't a table
2729        if not is_table:
2730            return None
2731
2732        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2733
2734        return annotations
2735
2736    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2737        """Generate dictionary of annotations for the given Synapse file.
2738        Synapse returns all custom annotations as lists since they
2739        can contain multiple values. In all cases, the values will
2740        be converted into strings and concatenated with ", ".
2741
2742        Args:
2743            fileId (str): Synapse ID for dataset file.
2744
2745        Returns:
2746            dict: Annotations as comma-separated strings.
2747        """
2748
2749        # Get entity metadata, including annotations
2750        try:
2751            entity = self.synapse_entity_tracker.get(
2752                synapse_id=fileId, syn=self.syn, download_file=False
2753            )
2754            is_file = entity.concreteType.endswith(".FileEntity")
2755            is_folder = entity.concreteType.endswith(".Folder")
2756            annotations_raw = entity.annotations
2757        except SynapseHTTPError:
2758            # If an error occurs with retrieving entity, skip it
2759            # This could be caused by a temporary file view that
2760            # was deleted since its ID was retrieved
2761            is_file, is_folder = False, False
2762
2763        # Skip anything that isn't a file or folder
2764        if not (is_file or is_folder):
2765            return None
2766
2767        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2768
2769        return annotations
2770
2771    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2772        # Extract annotations from their lists and stringify. For example:
2773        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2774        annotations = dict()
2775        for key, vals in annotations_raw.items():
2776            if isinstance(vals, list) and len(vals) == 1:
2777                annotations[key] = str(vals[0])
2778            else:
2779                annotations[key] = ", ".join(str(v) for v in vals)
2780
2781        # Add the file entity ID and eTag, which weren't lists
2782        assert fileId == entity.id, (
2783            "For some reason, the Synapse ID in the response doesn't match "
2784            "the Synapse ID sent in the request (via synapseclient)."
2785        )
2786        annotations["entityId"] = fileId
2787        annotations["eTag"] = entity.etag
2788
2789        return annotations
2790
2791    def getDatasetAnnotations(
2792        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2793    ) -> pd.DataFrame:
2794        """Generate table for annotations across all files in given dataset.
2795
2796        Args:
2797            datasetId (str): Synapse ID for dataset folder.
2798            fill_na (bool): Whether to replace missing values with
2799                blank strings.
2800            force_batch (bool): Whether to force the function to use
2801                the batch mode, which uses a file view to retrieve
2802                annotations for a given dataset. Default to False
2803                unless there are more than 50 files in the dataset.
2804
2805        Returns:
2806            pd.DataFrame: Table of annotations.
2807        """
2808        # Get all files in given dataset
2809        dataset_files = self.getFilesInStorageDataset(datasetId)
2810
2811        # if there are no dataset files, there are no annotations
2812        # return None
2813        if not dataset_files:
2814            return pd.DataFrame()
2815
2816        dataset_files_map = dict(dataset_files)
2817        dataset_file_ids, _ = list(zip(*dataset_files))
2818
2819        # Get annotations for each file from Step 1
2820        # Batch mode
2821        try_batch = len(dataset_files) >= 50 or force_batch
2822        if try_batch:
2823            try:
2824                logger.info("Trying batch mode for retrieving Synapse annotations")
2825                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2826            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2827                logger.info(
2828                    f"Unable to create a temporary file view bound to {datasetId}. "
2829                    "Defaulting to slower iterative retrieval of annotations."
2830                )
2831                # Fall back to the slower non-batch method
2832                logger.info("Batch mode failed (likely due to a permissions error)")
2833                try_batch = False
2834
2835        # Non-batch mode
2836        if not try_batch:
2837            logger.info("Using slower (non-batch) sequential mode")
2838            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2839            # Remove any annotations for non-file/folders (stored as None)
2840            records = filter(None, records)
2841            table = pd.DataFrame.from_records(records)
2842
2843        # Add filenames for the files that "survived" annotation retrieval
2844        filenames = [dataset_files_map[i] for i in table["entityId"]]
2845
2846        if "Filename" not in table.columns:
2847            table.insert(0, "Filename", filenames)
2848
2849        # Ensure that entityId and eTag are at the end
2850        entity_ids = table.pop("entityId")
2851        etags = table.pop("eTag")
2852        table.insert(len(table.columns), "entityId", entity_ids)
2853        table.insert(len(table.columns), "eTag", etags)
2854
2855        # Missing values are filled in with empty strings for Google Sheets
2856        if fill_na:
2857            table.fillna("", inplace=True)
2858
2859        # Force all values as strings
2860        return table.astype(str)
2861
2862    def raise_final_error(retry_state):
2863        return retry_state.outcome.result()  # re-raise the exception from the final retry attempt
2864
2865    def checkIfinAssetView(self, syn_id) -> bool:
2866        # get data in administrative fileview for this pipeline
2867        assetViewTable = self.getStorageFileviewTable()
2868        all_files = list(assetViewTable["id"])
2869        if syn_id in all_files:
2870            return True
2871        else:
2872            return False
2873
2874    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2875    @retry(
2876        stop=stop_after_attempt(5),
2877        wait=wait_chain(
2878            *[wait_fixed(10) for i in range(2)]
2879            + [wait_fixed(15) for i in range(2)]
2880            + [wait_fixed(20)]
2881        ),
2882        retry=retry_if_exception_type(LookupError),
2883        retry_error_callback=raise_final_error,
2884    )
2885    def getDatasetProject(self, datasetId: str) -> str:
2886        """Get parent project for a given dataset ID.
2887
2888        Args:
2889            datasetId (str): Synapse entity ID (folder or project).
2890
2891        Raises:
2892            ValueError: Raised if Synapse ID cannot be retrieved
2893            by the user or if it doesn't appear in the file view.
2894
2895        Returns:
2896            str: The Synapse ID for the parent project.
2897        """
2898
2899        # Subset main file view
2900        dataset_index = self.storageFileviewTable["id"] == datasetId
2901        dataset_row = self.storageFileviewTable[dataset_index]
2902
2903        # re-query if no datasets found
2904        if dataset_row.empty:
2905            sleep(5)
2906            self.query_fileview(force_requery=True)
2907            # Subset main file view
2908            dataset_index = self.storageFileviewTable["id"] == datasetId
2909            dataset_row = self.storageFileviewTable[dataset_index]
2910
2911        # Return `projectId` for given row if only one found
2912        if len(dataset_row) == 1:
2913            dataset_project = dataset_row["projectId"].values[0]
2914            return dataset_project
2915
2916        # Otherwise, check if already project itself
2917        try:
2918            syn_object = self.synapse_entity_tracker.get(
2919                synapse_id=datasetId, syn=self.syn, download_file=False
2920            )
2921            if syn_object.properties["concreteType"].endswith("Project"):
2922                return datasetId
2923        except SynapseHTTPError:
2924            raise PermissionError(
2925                f"The given dataset ({datasetId}) isn't accessible with this "
2926                "user. This might be caused by a typo in the dataset Synapse ID."
2927            )
2928
2929        # If not, then assume dataset not in file view
2930        raise LookupError(
2931            f"The given dataset ({datasetId}) doesn't appear in the "
2932            f"configured file view ({self.storageFileview}). This might "
2933            "mean that the file view's scope needs to be updated."
2934        )
2935
2936    def getDatasetAnnotationsBatch(
2937        self, datasetId: str, dataset_file_ids: Optional[Sequence[str]] = None
2938    ) -> pd.DataFrame:
2939        """Generate table for annotations across all files in given dataset.
2940        This function uses a temporary file view to generate a table
2941        instead of iteratively querying for individual entity annotations.
2942        This function is expected to run much faster than
2943        the iterative per-file retrieval in `self.getDatasetAnnotations` on large datasets.
2944
2945        Args:
2946            datasetId (str): Synapse ID for dataset folder.
2947            dataset_file_ids (Sequence[str]): List of Synapse IDs
2948                for dataset files/folders used to subset the table.
2949
2950        Returns:
2951            pd.DataFrame: Table of annotations.
2952        """
2953        # Create data frame from annotations file view
2954        with DatasetFileView(datasetId, self.syn) as fileview:
2955            table = fileview.query()
2956
2957        if dataset_file_ids:
2958            table = table.loc[table.index.intersection(dataset_file_ids)]
2959
2960        table = table.reset_index(drop=True)
2961
2962        return table
2963
2964    def _get_table_schema_by_cname(self, table_schema):
2965        # assume no duplicate column names in the table
2966        table_schema_by_cname = {}
2967
2968        for col_record in table_schema:
2969            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
2970            table_schema_by_cname[col_record["name"]] = col_record
2971
2972        return table_schema_by_cname

Implementation of the Storage interface for datasets/files stored on Synapse. Provides utilities to list files in a specific project, update file annotations, create file views, etc.

TODO: Need to define the interface and rename and/or refactor some of the methods below.

@tracer.start_as_current_span('SynapseStorage::__init__')
SynapseStorage( token: Optional[str] = None, access_token: Optional[str] = None, project_scope: Optional[list] = None, synapse_cache_path: Optional[str] = None, perform_query: Optional[bool] = True, columns: Optional[list] = None, where_clauses: Optional[list] = None)
298    @tracer.start_as_current_span("SynapseStorage::__init__")
299    def __init__(
300        self,
301        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
302        access_token: Optional[str] = None,
303        project_scope: Optional[list] = None,
304        synapse_cache_path: Optional[str] = None,
305        perform_query: Optional[bool] = True,
306        columns: Optional[list] = None,
307        where_clauses: Optional[list] = None,
308    ) -> None:
309        """Initializes a SynapseStorage object.
310
311        Args:
312            token (Optional[str], optional):
313              Optional token parameter as found in browser cookie upon login to synapse.
314              Defaults to None.
315            access_token (Optional[str], optional):
316              Optional access token (personal or oauth).
317              Defaults to None.
318            project_scope (Optional[list], optional): Defaults to None.
319            synapse_cache_path (Optional[str], optional):
320              Location of synapse cache.
321              Defaults to None.
322        TODO:
323            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
324        """
325        self.syn = self.login(synapse_cache_path, access_token)
326        self.project_scope = project_scope
327        self.storageFileview = CONFIG.synapse_master_fileview_id
328        self.manifest = CONFIG.synapse_manifest_basename
329        self.root_synapse_cache = self.syn.cache.cache_root_dir
330        self.synapse_entity_tracker = SynapseEntityTracker()
331        if perform_query:
332            self.query_fileview(columns=columns, where_clauses=where_clauses)

Initializes a SynapseStorage object.

Arguments:
  • token (Optional[str], optional): Optional token parameter as found in browser cookie upon login to synapse. Defaults to None.
  • access_token (Optional[str], optional): Optional access token (personal or oauth). Defaults to None.
  • project_scope (Optional[list], optional): Defaults to None.
  • synapse_cache_path (Optional[str], optional): Location of synapse cache. Defaults to None.
TODO:

Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how query_fileview is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.

syn
project_scope
storageFileview
manifest
root_synapse_cache
synapse_entity_tracker
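
A minimal construction sketch (the token and cache path are hypothetical placeholders; by default the constructor also runs query_fileview):

    from schematic.store.synapse import SynapseStorage

    # Logs in and queries the fileview configured as CONFIG.synapse_master_fileview_id
    store = SynapseStorage(
        access_token="my-token",                 # hypothetical personal access token
        synapse_cache_path="/tmp/synapse_cache", # hypothetical cache location
    )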
@tracer.start_as_current_span('SynapseStorage::query_fileview')
def query_fileview( self, columns: Optional[list] = None, where_clauses: Optional[list] = None, force_requery: Optional[bool] = False) -> None:
371    @tracer.start_as_current_span("SynapseStorage::query_fileview")
372    def query_fileview(
373        self,
374        columns: Optional[list] = None,
375        where_clauses: Optional[list] = None,
376        force_requery: Optional[bool] = False,
377    ) -> None:
378        """
379        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
380        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
381        Args:
382            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
383            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
384            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
385        """
386        self._purge_synapse_cache()
387
388        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
389        self.new_query_different = True
390
391        # If a query has already been performed, store the query
392        previous_query_built = hasattr(self, "fileview_query")
393        if previous_query_built:
394            previous_query = self.fileview_query
395
396        # Build a query with the current given parameters and check to see if it is different from the previous
397        self._build_query(columns=columns, where_clauses=where_clauses)
398        if previous_query_built:
399            self.new_query_different = self.fileview_query != previous_query
400
401        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
402        if self.new_query_different or force_requery:
403            try:
404                self.storageFileviewTable = self.syn.tableQuery(
405                    query=self.fileview_query,
406                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
407            except SynapseHTTPError as exc:
408                exception_text = str(exc)
409                if "Unknown column path" in exception_text:
410                    raise ValueError(
411                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
412                    )
413                elif "Unknown column" in exception_text:
414                    missing_column = exception_text.split("Unknown column ")[-1]
415                    raise ValueError(
416                        f"The column {missing_column} specified in the query does not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
417                    )
418                else:
419                    raise AccessCredentialsError(self.storageFileview)

Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.

Arguments:
  • columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
  • where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
  • force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
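
A sketch of re-scoping the fileview after initialization (the dataset ID is a hypothetical placeholder):

    # Limit the cached fileview to one dataset's files, requesting only two columns
    where = [SynapseStorage.build_clause_from_dataset_id(dataset_id="syn12345678")]
    store.query_fileview(columns=["id", "path"], where_clauses=where, force_requery=True)
    table = store.getStorageFileviewTable()  # pandas DataFrame holding the results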
@staticmethod
def build_clause_from_dataset_id( dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None) -> str:
421    @staticmethod
422    def build_clause_from_dataset_id(
423        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
424    ) -> str:
425        """
426        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
427        Args:
428            dataset_id: Synapse ID of a dataset that should be used to limit the query
429            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
430        Returns:
431            clause for the query or an empty string if no dataset ID is provided
432        """
433        # Calling this method without specifying synIDs will complete but will not scope the view
434        if (not dataset_id) and (not dataset_folder_list):
435            return ""
436
437        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
438        if dataset_folder_list:
439            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
440            return f"parentId IN ({search_folders})"
441
442        # `dataset_id` should be provided when all files are stored directly under the dataset folder
443        return f"parentId='{dataset_id}'"

Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.

Arguments:
  • dataset_id: Synapse ID of a dataset that should be used to limit the query
  • dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:

clause for the query or an empty string if no dataset ID is provided
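
Both clause forms, with placeholder Synapse IDs:

    SynapseStorage.build_clause_from_dataset_id(dataset_id="syn111")
    # -> "parentId='syn111'"
    SynapseStorage.build_clause_from_dataset_id(dataset_folder_list=["syn111", "syn222"])
    # -> "parentId IN ('syn111', 'syn222')"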

@staticmethod
@tracer.start_as_current_span('SynapseStorage::login')
def login( synapse_cache_path: Optional[str] = None, access_token: Optional[str] = None) -> synapseclient.client.Synapse:
483    @staticmethod
484    @tracer.start_as_current_span("SynapseStorage::login")
485    def login(
486        synapse_cache_path: Optional[str] = None,
487        access_token: Optional[str] = None,
488    ) -> synapseclient.Synapse:
489        """Login to Synapse
490
491        Args:
492            access_token (Optional[str], optional): A synapse access token. Defaults to None.
493            synapse_cache_path (Optional[str]): location of synapse cache
494
495        Raises:
496            ValueError: If unable to log in with the access token
497
498        Returns:
499            synapseclient.Synapse: A Synapse object that is logged in
500        """
501        if not access_token:
502            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
503
504        # login using a token
505        if access_token:
506            try:
507                syn = synapseclient.Synapse(
508                    cache_root_dir=synapse_cache_path,
509                    debug=False,
510                    skip_checks=True,
511                    cache_client=False,
512                )
513                syn.login(authToken=access_token, silent=True)
514            except SynapseHTTPError as exc:
515                raise ValueError(
516                    "No access to resources. Please make sure that your token is correct"
517                ) from exc
518        else:
519            # login using synapse credentials provided by user in .synapseConfig (default) file
520            syn = synapseclient.Synapse(
521                configPath=CONFIG.synapse_configuration_path,
522                cache_root_dir=synapse_cache_path,
523                debug=False,
524                skip_checks=True,
525                cache_client=False,
526            )
527            syn.login(silent=True)
528
529        # set user id attribute
530        current_span = trace.get_current_span()
531        if current_span.is_recording():
532            current_span.set_attribute("user.id", syn.credentials.owner_id)
533
534        return syn

Login to Synapse

Arguments:
  • access_token (Optional[str], optional): A synapse access token. Defaults to None.
  • synapse_cache_path (Optional[str]): location of synapse cache
Raises:
  • ValueError: If unable to log in with the access token
Returns:

synapseclient.Synapse: A Synapse object that is logged in
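
A sketch of logging in via the environment-variable fallback (the token value is a hypothetical placeholder):

    import os

    os.environ["SYNAPSE_ACCESS_TOKEN"] = "my-token"  # hypothetical token
    syn = SynapseStorage.login(synapse_cache_path="/tmp/synapse_cache")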

def missing_entity_handler(method):
536    def missing_entity_handler(method):
537        def wrapper(*args, **kwargs):
538            try:
539                return method(*args, **kwargs)
540            except SynapseHTTPError as ex:
541                str_message = str(ex).replace("\n", "")
542                if "trash" in str_message or "does not exist" in str_message:
543                    logging.warning(str_message)
544                    return None
545                else:
546                    raise ex
547
548        return wrapper
def async_missing_entity_handler(method):
550    def async_missing_entity_handler(method):
551        """Decorator to handle missing entities in async methods."""
552
553        async def wrapper(*args: Any, **kwargs: Any) -> Any:
554            try:
555                return await method(*args, **kwargs)
556            except SynapseHTTPError as ex:
557                str_message = str(ex).replace("\n", "")
558                if "trash" in str_message or "does not exist" in str_message:
559                    logging.warning(str_message)
560                    return None
561                else:
562                    raise ex
563
564        return wrapper

Decorator to handle missing entities in async methods.
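
A sketch of applying the decorator; fetch_entity_async is a hypothetical placeholder for any awaitable Synapse call:

    @async_missing_entity_handler
    async def get_annotations_safe(self, synapse_id: str):
        # If Synapse reports the entity is in the trash or does not exist,
        # the decorator logs a warning and returns None instead of raising.
        return await fetch_entity_async(synapse_id)  # hypothetical helper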

def getStorageFileviewTable(self):
566    def getStorageFileviewTable(self):
567        """Returns the storageFileviewTable obtained during initialization."""
568        return self.storageFileviewTable

Returns the storageFileviewTable obtained during initialization.

def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
570    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
571        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
572
573        Args:
574            currentUserId: synapse id for the user whose projects we want to get.
575
576        Returns:
577            A dictionary with a next page token and the results.
578        """
579        all_results = self.syn.restGET(
580            "/projects/user/{principalId}".format(principalId=currentUserId)
581        )
582
583        while (
584            "nextPageToken" in all_results
585        ):  # iterate over next page token in results while there is any
586            results_token = self.syn.restGET(
587                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
588                    principalId=currentUserId,
589                    nextPageToken=all_results["nextPageToken"],
590                )
591            )
592            all_results["results"].extend(results_token["results"])
593
594            if "nextPageToken" in results_token:
595                all_results["nextPageToken"] = results_token["nextPageToken"]
596            else:
597                del all_results["nextPageToken"]
598
599        return all_results

Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.

Arguments:
  • currentUserId: synapse id for the user whose projects we want to get.
Returns:

A dictionary with a next page token and the results.
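
For example, collecting every accessible project name for the logged-in user (pagination is followed internally by the method):

    results = store.getPaginatedRestResults(store.syn.credentials.owner_id)
    project_names = [header["name"] for header in results["results"]]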

@tracer.start_as_current_span('SynapseStorage::getStorageProjects')
def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
601    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
602    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
603        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
604
605        Returns:
606            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
607        """
608
609        # get the set of all storage Synapse project accessible for this pipeline
610        storageProjects = self.storageFileviewTable["projectId"].unique()
611
612        # get the set of storage Synapse project accessible for this user
613        # get a list of projects from Synapse
614        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
615            current_user_id=self.syn.credentials.owner_id, syn=self.syn
616        )
617        project_id_to_name_dict = {}
618        current_user_projects = []
619        for project_header in current_user_project_headers:
620            project_id_to_name_dict[project_header.get("id")] = project_header.get(
621                "name"
622            )
623            current_user_projects.append(project_header.get("id"))
624
625        # find set of user projects that are also in this pipeline's storage projects set
626        storageProjects = list(set(storageProjects) & set(current_user_projects))
627
628        # Limit projects to scope if specified
629        if project_scope:
630            storageProjects = list(set(storageProjects) & set(project_scope))
631
632            if not storageProjects:
633                raise Warning(
634                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
635                )
636
637        # prepare a return list of project IDs and names
638        projects = []
639        for projectId in storageProjects:
640            project_name_from_project_header = project_id_to_name_dict.get(projectId)
641            projects.append((projectId, project_name_from_project_header))
642
643        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
644
645        return sorted_projects_list

Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

Returns:

A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
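
A usage sketch; the project IDs in the scope are hypothetical placeholders:

    projects = store.getStorageProjects(project_scope=["syn111", "syn222"])
    for project_id, project_name in projects:  # sorted by project ID
        print(project_id, project_name)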

@tracer.start_as_current_span('SynapseStorage::getStorageDatasetsInProject')
def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
647    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
648    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
649        """Gets all datasets in folder under a given storage project that the current user has access to.
650
651        Args:
652            projectId: synapse ID of a storage project.
653
654        Returns:
655            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
656            None: If the projectId cannot be found on Synapse.
657        """
658
659        # select all folders and fetch their names from within the storage project;
660        # if folder content type is defined, only select folders that contain datasets
661        if "contentType" in self.storageFileviewTable.columns:
662            foldersTable = self.storageFileviewTable[
663                (self.storageFileviewTable["contentType"] == "dataset")
664                & (self.storageFileviewTable["projectId"] == projectId)
665            ]
666        else:
667            foldersTable = self.storageFileviewTable[
668                (self.storageFileviewTable["type"] == "folder")
669                & (self.storageFileviewTable["parentId"] == projectId)
670            ]
671
672        # get an array of tuples (folderId, folderName)
673        # some folders are part of datasets; others contain datasets
674        # each dataset's parent is the project; folders that are part of a dataset have another folder as a parent
675        # to get folders if and only if they contain datasets, check for each folder
676        # whether its parent is the project; if so, that folder contains a dataset,
677        # unless the folder list has already been filtered to dataset folders based on the contentType attribute above
678
679        datasetList = []
680        folderProperties = ["id", "name"]
681        for folder in list(
682            foldersTable[folderProperties].itertuples(index=False, name=None)
683        ):
684            datasetList.append(folder)
685
686        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
687
688        return sorted_dataset_list

Gets all datasets in folder under a given storage project that the current user has access to.

Arguments:
  • projectId: synapse ID of a storage project.
Returns:

A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). None: If the projectId cannot be found on Synapse.
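
A usage sketch with a placeholder project ID; the returned tuples shown are illustrative:

    datasets = store.getStorageDatasetsInProject(projectId="syn12345678")
    # e.g. [("syn222", "biospecimen_data"), ("syn333", "clinical_data")]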

@tracer.start_as_current_span('SynapseStorage::getFilesInStorageDataset')
def getFilesInStorageDataset( self, datasetId: str, fileNames: List = None, fullpath: bool = True) -> List[Tuple[str, str]]:
690    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
691    def getFilesInStorageDataset(
692        self, datasetId: str, fileNames: List = None, fullpath: bool = True
693    ) -> List[Tuple[str, str]]:
694        """Gets all files (excluding manifest files) in a given dataset folder.
695
696        Args:
697            datasetId: synapse ID of a storage dataset.
698            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
699            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
700            fullpath: if True return the full path as part of this filename; otherwise return just base filename
701
702        Returns:
703            A list of files; the list consists of tuples (fileId, fileName).
704
705        Raises:
706            ValueError: Dataset ID not found.
707        """
708        file_list = []
709
710        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
711        if self.storageFileviewTable.empty:
712            raise ValueError(
713                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
714            )
715
716        child_path = self.storageFileviewTable.loc[
717            self.storageFileviewTable["parentId"] == datasetId, "path"
718        ]
719        if child_path.empty:
720            raise LookupError(
721                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
722            )
723        child_path = child_path.iloc[0]
724
725        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
726        parent = child_path.split("/")[:-1]
727        parent = "/".join(parent)
728
729        # Format dataset path to be used in table query
730        dataset_path = f"'{parent}/%'"
731
732        # When querying, only include files to exclude entity files and subdirectories
733        where_clauses = [f"path like {dataset_path}", "type='file'"]
734
735        # Requery the fileview to specifically get the files in the given dataset
736        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
737
738        # Exclude manifest files
739        non_manifest_files = self.storageFileviewTable.loc[
740            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
741            :,
742        ]
743
744        # Remove all files that are not in the list of fileNames
745        if fileNames:
746            filename_regex = "|".join(fileNames)
747
748            matching_files = non_manifest_files["path"].str.contains(
749                filename_regex, case=False, regex=True
750            )
751
752            non_manifest_files = non_manifest_files.loc[matching_files, :]
753
754        # Truncate path if necessary
755        if not fullpath:
756            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
757
758        # Return list of files as expected by other methods
759        file_list = list(non_manifest_files.itertuples(index=False, name=None))
760
761        return file_list

Gets all files (excluding manifest files) in a given dataset folder.

Arguments:
  • datasetId: synapse ID of a storage dataset.
  • fileNames: get a list of files with particular names; defaults to None, in which case all dataset files are returned (except bookkeeping files, e.g. metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
  • fullpath: if True return the full path as part of this filename; otherwise return just base filename
Returns:

A list of files; the list consists of tuples (fileId, fileName).

Raises:
  • ValueError: Dataset ID not found.
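
A usage sketch; the dataset ID and filename are hypothetical placeholders:

    # All non-manifest files in the dataset, as (fileId, path) tuples
    files = store.getFilesInStorageDataset("syn12345678", fullpath=False)

    # Only files whose names match entries in fileNames
    bam_files = store.getFilesInStorageDataset("syn12345678", fileNames=["sample_A.bam"])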
@tracer.start_as_current_span('SynapseStorage::getDatasetManifest')
def getDatasetManifest( self, datasetId: str, downloadFile: bool = False, newManifestName: str = '', use_temporary_folder: bool = True) -> Union[str, synapseclient.entity.File]:
788    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
789    def getDatasetManifest(
790        self,
791        datasetId: str,
792        downloadFile: bool = False,
793        newManifestName: str = "",
794        use_temporary_folder: bool = True,
795    ) -> Union[str, File]:
796        """Gets the manifest associated with a given dataset.
797
798        Args:
799            datasetId: synapse ID of a storage dataset.
800            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
801            newManifestName: new name of a manifest that gets downloaded
802            use_temporary_folder: boolean argument indicating if a temporary folder
803                should be used to store the manifest file. This is useful when running
804                this code as an API server where multiple requests could be made at the
805                same time. This is set to False when the code is being used from the
806                CLI. Defaults to True.
807
808        Returns:
809            manifest_syn_id (String): Synapse ID of existing manifest file.
810            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
811            "" (String): No pre-existing manifest in dataset.
812        """
813        manifest_data = ""
814
815        # get a list of files containing the manifest for this dataset (if any)
816        all_files = self.storageFileviewTable
817
818        # construct regex based on manifest basename in the config
819        manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv")
820
821        # search manifest based on given manifest basename regex above
822        # and return a dataframe containing name and id of manifests in a given asset view
823        manifest = all_files[
824            (all_files["name"].str.contains(manifest_re, regex=True))
825            & (all_files["parentId"] == datasetId)
826        ]
827
828        manifest = manifest[["id", "name"]]
829
830        # if there is no pre-existing manifest in the specified dataset
831        if manifest.empty:
832            logger.warning(
833                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
834            )
835            return ""
836
837        # if there is an existing manifest
838        else:
839            manifest_syn_id = self._get_manifest_id(manifest)
840            if downloadFile:
841                md = ManifestDownload(
842                    self.syn,
843                    manifest_id=manifest_syn_id,
844                    synapse_entity_tracker=self.synapse_entity_tracker,
845                )
846                manifest_data = md.download_manifest(
847                    newManifestName=newManifestName,
848                    manifest_df=manifest,
849                    use_temporary_folder=use_temporary_folder,
850                )
851                # TODO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
852                # then we should catch the error here without returning an empty string.
853                if not manifest_data:
854                    logger.debug(
855                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
856                    )
857                return manifest_data
858            return manifest_syn_id

Gets the manifest associated with a given dataset.

Arguments:
  • datasetId: synapse ID of a storage dataset.
  • downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
  • newManifestName: new name of a manifest that gets downloaded
  • use_temporary_folder: boolean argument indicating if a temporary folder should be used to store the manifest file. This is useful when running this code as an API server where multiple requests could be made at the same time. This is set to False when the code is being used from the CLI. Defaults to True.
Returns:

  • manifest_syn_id (String): Synapse ID of existing manifest file.
  • manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
  • "" (String): No pre-existing manifest in dataset.

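A sketch of both call modes, assuming a configured SynapseStorage instance `store` and a placeholder dataset ID:

```python
# Look up only the manifest's Synapse ID (returns "" when none exists)
manifest_syn_id = store.getDatasetManifest("syn12345678")

# Download the manifest file itself (returns a File entity)
if manifest_syn_id:
    manifest_file = store.getDatasetManifest(
        "syn12345678", downloadFile=True, use_temporary_folder=True
    )
    print(manifest_file.path)  # local path of the downloaded manifest
```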
def getDataTypeFromManifest(self, manifestId: str):
860    def getDataTypeFromManifest(self, manifestId: str):
861        """Fetch a manifest and return data types of all columns
862        Args:
863            manifestId: synapse ID of a manifest
864        """
865        # get manifest file path
866        manifest_entity = self.synapse_entity_tracker.get(
867            synapse_id=manifestId, syn=self.syn, download_file=True
868        )
869        manifest_filepath = manifest_entity.path
870
871        # load manifest dataframe
872        manifest = load_df(
873            manifest_filepath,
874            preserve_raw_input=False,
875            data_model=False,
876        )
877
878        # convert the dataFrame to use best possible dtypes.
879        manifest_new = manifest.convert_dtypes()
880
881        # get data types of columns
882        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
883
884        # return the result as a dictionary
885        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
886
887        return result_dict

Fetch a manifest and return data types of all columns

Arguments:
  • manifestId: synapse ID of a manifest
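A short sketch, assuming a configured SynapseStorage instance `store`; `syn23456789` is a placeholder manifest ID:

```python
# Print the inferred dtype of every manifest column
dtypes = store.getDataTypeFromManifest("syn23456789")
for column, dtype in dtypes.items():
    print(f"{column}: {dtype}")
```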
def add_entity_id_and_filename( self, datasetId: str, manifest: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
911    def add_entity_id_and_filename(
912        self, datasetId: str, manifest: pd.DataFrame
913    ) -> pd.DataFrame:
914        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present
915
916        Args:
917            datasetId (str): dataset syn id
918            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
919
920        Returns:
921            pd.DataFrame: updated manifest with the Filename column filled in and an entityId column appended
922        """
923        # get file names and entity ids of a given dataset
924        dataset_files_dict = self._get_files_metadata_from_dataset(
925            datasetId, only_new_files=False
926        )
927
928        if dataset_files_dict:
929            # turn manifest dataframe back to a dictionary for operation
930            manifest_dict = manifest.to_dict("list")
931
932            # update Filename column
933            # add entityId column to the end
934            manifest_dict.update(dataset_files_dict)
935
936            # if the component column exists in existing manifest, fill up that column
937            if "Component" in manifest_dict.keys():
938                manifest_dict["Component"] = manifest_dict["Component"] * max(
939                    1, len(manifest_dict["Filename"])
940                )
941
942            # turn dictionary back to a dataframe
943            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
944            manifest_df_updated = manifest_df_index.transpose()
945
946            # fill na with empty string
947            manifest_df_updated = manifest_df_updated.fillna("")
948
949            # drop index
950            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
951
952            return manifest_df_updated
953        else:
954            return manifest

Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present

Arguments:
  • datasetId (str): dataset syn id
  • manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
Returns:

pd.DataFrame: updated manifest with the Filename column filled in and an entityId column appended

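A sketch of the expected input shape, assuming a configured SynapseStorage instance `store`; the dataset ID and component name are placeholders:

```python
import pandas as pd

# Manifest with an empty Filename column and no entityId column,
# as this method assumes
manifest = pd.DataFrame({"Filename": [""], "Component": ["BulkRNA-seq"]})
updated = store.add_entity_id_and_filename("syn12345678", manifest)
print(updated.columns.tolist())  # now includes Filename and entityId
```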
def fill_in_entity_id_filename( self, datasetId: str, manifest: pandas.core.frame.DataFrame) -> Tuple[List, pandas.core.frame.DataFrame]:
 956    def fill_in_entity_id_filename(
 957        self, datasetId: str, manifest: pd.DataFrame
 958    ) -> Tuple[List, pd.DataFrame]:
 959        """Fill in the Filename and entityId columns. Both columns will be created if not already present.
 960
 961        Args:
 962            datasetId (str): dataset syn id
 963            manifest (pd.DataFrame): existing manifest dataframe.
 964
 965        Returns:
 966            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 967        """
 968        # get dataset file names and entity id as a list of tuple
 969        dataset_files = self.getFilesInStorageDataset(datasetId)
 970
 971        # update manifest with additional filenames, if any
 972        # note that if there is an existing manifest and there are files in the dataset
 973        # the columns Filename and entityId are assumed to be present in manifest schema
 974        # TODO: use idiomatic pandas syntax
 975        if not dataset_files:
 976            manifest = manifest.fillna("")
 977            return dataset_files, manifest
 978
 979        all_files = self._get_file_entityIds(
 980            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 981        )
 982        new_files = self._get_file_entityIds(
 983            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 984        )
 985
 986        all_files = pd.DataFrame(all_files)
 987        new_files = pd.DataFrame(new_files)
 988
 989        # update manifest so that it contains new dataset files
 990        manifest = (
 991            pd.concat([manifest, new_files], sort=False)
 992            .reset_index()
 993            .drop("index", axis=1)
 994        )
 995
 996        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 997        manifest_reindex = manifest.set_index("entityId")
 998        all_files_reindex = all_files.set_index("entityId")
 999        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1000            manifest_reindex
1001        )
1002
1003        # Check if individual file paths in manifest and from synapse match
1004        file_paths_match = (
1005            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1006        )
1007
1008        # If all the paths do not match, update the manifest with the filepaths from synapse
1009        if not file_paths_match.all():
1010            manifest_reindex.loc[
1011                ~file_paths_match, "Filename"
1012            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1013
1014            # reformat manifest for further use
1015            manifest = manifest_reindex.reset_index()
1016            entityIdCol = manifest.pop("entityId")
1017            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1018
1019        manifest = manifest.fillna("")
1020        return dataset_files, manifest

Fill in the Filename and entityId columns. Both columns will be created if not already present.

Arguments:
  • datasetId (str): dataset syn id
  • manifest (pd.DataFrame): existing manifest dataframe.
Returns:

Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe

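A sketch, assuming a configured SynapseStorage instance `store` and an existing manifest dataframe `manifest`; the dataset ID is a placeholder:

```python
dataset_files, manifest = store.fill_in_entity_id_filename(
    "syn12345678", manifest
)
# dataset_files: files found under the dataset folder
# manifest: updated dataframe with Filename/entityId aligned to Synapse
```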
@tracer.start_as_current_span('SynapseStorage::updateDatasetManifestFiles')
def updateDatasetManifestFiles( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, datasetId: str, store: bool = True) -> Optional[Tuple[str, pandas.core.frame.DataFrame]]:
1022    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1023    def updateDatasetManifestFiles(
1024        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1025    ) -> Union[Tuple[str, pd.DataFrame], None]:
1026        """Fetch the names and entity IDs of all current files in a dataset in the store, if any; update the dataset's manifest with new files, if any.
1027
1028        Args:
1029            dmge: DataModelGraphExplorer Instance
1030            datasetId: synapse ID of a storage dataset.
1031            store: if set to True store updated manifest in asset store; if set to False
1032            return a Pandas dataframe containing updated manifest but do not store to asset store
1033
1034
1035        Returns:
1036            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1037            If there is no existing manifest or if the manifest does not have an entityId column, return None
1038        """
1039
1040        # get existing manifest Synapse ID
1041        manifest_id = self.getDatasetManifest(datasetId)
1042
1043        # if there is no manifest return None
1044        if not manifest_id:
1045            return None
1046
1047        manifest_entity = self.synapse_entity_tracker.get(
1048            synapse_id=manifest_id, syn=self.syn, download_file=True
1049        )
1050        manifest_filepath = manifest_entity.path
1051        manifest = load_df(manifest_filepath)
1052
1053        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1054        if "entityId" not in manifest.columns:
1055            return None
1056
1057        manifest_is_file_based = "Filename" in manifest.columns
1058
1059        if manifest_is_file_based:
1060            # update manifest with additional filenames, if any
1061            # note that if there is an existing manifest and there are files in the dataset
1062            # the columns Filename and entityId are assumed to be present in manifest schema
1063            # TODO: use idiomatic pandas syntax
1064            dataset_files, manifest = self.fill_in_entity_id_filename(
1065                datasetId, manifest
1066            )
1067            if dataset_files:
1068                # update the manifest file, so that it contains the relevant entity IDs
1069                if store:
1070                    manifest.to_csv(manifest_filepath, index=False)
1071
1072                    # store manifest and update associated metadata with manifest on Synapse
1073                    manifest_id = self.associateMetadataWithFiles(
1074                        dmge, manifest_filepath, datasetId
1075                    )
1076
1077        return manifest_id, manifest

Fetch the names and entity IDs of all current files in a dataset in the store, if any; update the dataset's manifest with new files, if any.

Arguments:
  • dmge: DataModelGraphExplorer Instance
  • datasetId: synapse ID of a storage dataset.
  • store: if set to True, store the updated manifest in the asset store; if set to False, return a Pandas dataframe containing the updated manifest without storing it to the asset store
Returns:

Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. If there is no existing manifest or if the manifest does not have an entityId column, return None

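A sketch, assuming a configured SynapseStorage instance `store` and a DataModelGraphExplorer `dmge` built from your data model; the dataset ID is a placeholder:

```python
# Refresh the dataset's manifest without writing back to the asset store
result = store.updateDatasetManifestFiles(dmge, "syn12345678", store=False)
if result is not None:
    manifest_id, manifest_df = result
```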
@tracer.start_as_current_span('SynapseStorage::getProjectManifests')
def getProjectManifests( self, projectId: str) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1123    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1124    def getProjectManifests(
1125        self, projectId: str
1126    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1127        """Gets all metadata manifest files across all datasets in a specified project.
1128
1129        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1130                 as a list of tuples, one for each manifest:
1131                    [
1132                        (
1133                            (datasetId, dataName),
1134                            (manifestId, manifestName),
1135                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1136                        ),
1137                        ...
1138                    ]
1139
1140        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1141        """
1142        component = None
1143        entity = None
1144        manifests = []
1145
1146        datasets = self.getStorageDatasetsInProject(projectId)
1147
1148        for datasetId, datasetName in datasets:
1149            # encode information about the manifest in a simple list (so that R clients can unpack it)
1150            # eventually can serialize differently
1151
1152            # Get synID of manifest for a dataset
1153            manifestId = self.getDatasetManifest(datasetId)
1154
1155            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1156            if manifestId:
1157                annotations = self.getFileAnnotations(manifestId)
1158
1159                # If manifest has annotations specifying component, use that
1160                if annotations and "Component" in annotations:
1161                    component = annotations["Component"]
1162                    entity = self.synapse_entity_tracker.get(
1163                        synapse_id=manifestId, syn=self.syn, download_file=False
1164                    )
1165                    manifest_name = entity["properties"]["name"]
1166
1167                # otherwise download the manifest and parse for information
1168                elif not annotations or "Component" not in annotations:
1169                    logging.debug(
1170                        f"No component annotations have been found for manifest {manifestId}. "
1171                        "The manifest will be downloaded and parsed instead. "
1172                        "For increased speed, add component annotations to manifest."
1173                    )
1174
1175                    manifest_info = self.getDatasetManifest(
1176                        datasetId, downloadFile=True
1177                    )
1178                    manifest_name = manifest_info["properties"].get("name", "")
1179
1180                    if not manifest_name:
1181                        logger.error(f"Failed to download manifests from {datasetId}")
1182
1183                    manifest_path = manifest_info["path"]
1184
1185                    manifest_df = load_df(manifest_path)
1186
1187                    # Get component from component column if it exists
1188                    if (
1189                        "Component" in manifest_df
1190                        and not manifest_df["Component"].empty
1191                    ):
1192                        # Deduplicate the components listed in the manifest
1193                        component = list(set(manifest_df["Component"]))
1194
1195                        # Added to address issues raised during DCA testing
1196                        if "" in component:
1197                            component.remove("")
1198
1199                        if len(component) == 1:
1200                            component = component[0]
1201                        elif len(component) > 1:
1202                            logging.warning(
1203                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1204                                "Behavior of manifests with multiple components is undefined"
1205                            )
1206            else:
1207                manifest_name = ""
1208                component = None
1209            if component:
1210                manifest = (
1211                    (datasetId, datasetName),
1212                    (manifestId, manifest_name),
1213                    (component, component),
1214                )
1215            elif manifestId:
1216                logging.debug(
1217                    f"Manifest {manifestId} does not have an associated Component"
1218                )
1219                manifest = (
1220                    (datasetId, datasetName),
1221                    (manifestId, manifest_name),
1222                    ("", ""),
1223                )
1224            else:
1225                manifest = (
1226                    (datasetId, datasetName),
1227                    ("", ""),
1228                    ("", ""),
1229                )
1230
1231            if manifest:
1232                manifests.append(manifest)
1233
1234        return manifests

Gets all metadata manifest files across all datasets in a specified project.

Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest as a list of tuples, one for each manifest:

    [
        (
            (datasetId, dataName),
            (manifestId, manifestName),
            (componentSchemaLabel, componentSchemaLabel)  # TODO: get component name from schema
        ),
        ...
    ]

TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface

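A sketch of unpacking the returned triples, assuming a configured SynapseStorage instance `store`; the project ID is a placeholder:

```python
for dataset, manifest, component in store.getProjectManifests("syn11111111"):
    dataset_id, dataset_name = dataset
    manifest_id, manifest_name = manifest
    print(dataset_name, manifest_id, component[0])
```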
def upload_project_manifests_to_synapse( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, projectId: str) -> List[str]:
1236    def upload_project_manifests_to_synapse(
1237        self, dmge: DataModelGraphExplorer, projectId: str
1238    ) -> List[str]:
1239        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1240
1241        Returns: List of the names of the datasets whose manifests have been loaded.
1242        """
1243
1244        manifests = []
1245        manifest_loaded = []
1246        datasets = self.getStorageDatasetsInProject(projectId)
1247
1248        for datasetId, datasetName in datasets:
1249            # encode information about the manifest in a simple list (so that R clients can unpack it)
1250            # eventually can serialize differently
1251
1252            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1253
1254            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1255            if manifest_info:
1256                manifest_id = manifest_info["properties"]["id"]
1257                manifest_name = manifest_info["properties"]["name"]
1258                manifest_path = manifest_info["path"]
1259                manifest_df = load_df(manifest_path)
1260                manifest_table_id = self.uploadDB(
1261                    dmge=dmge,
1262                    manifest=manifest_df,
1263                    datasetId=datasetId,
1264                    table_name=datasetName,
1265                )
1266                manifest_loaded.append(datasetName)
1267        return manifest_loaded

Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.

Returns: List of the names of the datasets whose manifests have been loaded.

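A sketch, assuming a configured SynapseStorage instance `store` and a DataModelGraphExplorer `dmge`; the project ID is a placeholder:

```python
loaded = store.upload_project_manifests_to_synapse(dmge, "syn11111111")
print(loaded)  # names of the datasets whose manifests were loaded
```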
def upload_annotated_project_manifests_to_synapse( self, projectId: str, path_to_json_ld: str, dry_run: bool = False) -> List[str]:
1269    def upload_annotated_project_manifests_to_synapse(
1270        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1271    ) -> List[str]:
1272        """
1273        Purpose:
1274            For all manifests in a project, upload them as a table and add annotations from the manifest csv.
1275            Assumes the manifest is already present as a CSV in a dataset in the project.
1276
1277        """
1278        # Instantiate DataModelParser
1279        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1280        # Parse Model
1281        parsed_data_model = data_model_parser.parse_model()
1282
1283        # Instantiate DataModelGraph
1284        data_model_grapher = DataModelGraph(parsed_data_model)
1285
1286        # Generate graph
1287        graph_data_model = data_model_grapher.generate_data_model_graph()
1288
1289        # Instantiate DataModelGraphExplorer
1290        dmge = DataModelGraphExplorer(graph_data_model)
1291
1292        manifests = []
1293        manifest_loaded = []
1294        datasets = self.getStorageDatasetsInProject(projectId)
1295        for datasetId, datasetName in datasets:
1296            # encode information about the manifest in a simple list (so that R clients can unpack it)
1297            # eventually can serialize differently
1298
1299            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1300            manifests.append(manifest)
1301
1302            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1303
1304            if manifest_info:
1305                manifest_id = manifest_info["properties"]["id"]
1306                manifest_name = manifest_info["properties"]["name"]
1307                manifest_path = manifest_info["path"]
1308                manifest = (
1309                    (datasetId, datasetName),
1310                    (manifest_id, manifest_name),
1311                    ("", ""),
1312                )
1313                if not dry_run:
1314                    self.associateMetadataWithFiles(
1315                        dmge, manifest_path, datasetId, manifest_record_type="table"
1316                    )
1317                manifest_loaded.append(manifest)
1318
1319        return manifests, manifest_loaded
Purpose:

For all manifests in a project, upload them as a table and add annotations from the manifest csv. Assumes the manifest is already present as a CSV in a dataset in the project.

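A sketch using the dry-run mode, assuming a configured SynapseStorage instance `store`; the project ID and data model path are placeholders:

```python
manifests, loaded = store.upload_annotated_project_manifests_to_synapse(
    projectId="syn11111111",
    path_to_json_ld="path/to/data_model.jsonld",
    dry_run=True,  # skip associating metadata; just collect the manifests
)
```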
def move_entities_to_new_project( self, projectId: str, newProjectId: str, returnEntities: bool = False, dry_run: bool = False):
1321    def move_entities_to_new_project(
1322        self,
1323        projectId: str,
1324        newProjectId: str,
1325        returnEntities: bool = False,
1326        dry_run: bool = False,
1327    ):
1328        """
1329        For each manifest csv in a project, look for all the entity ids that are associated.
1330        Look up the entity in the files, move the entity to the new project.
1331        """
1332
1333        manifests = []
1334        manifest_loaded = []
1335        datasets = self.getStorageDatasetsInProject(projectId)
1336        if datasets:
1337            for datasetId, datasetName in datasets:
1338                # encode information about the manifest in a simple list (so that R clients can unpack it)
1339                # eventually can serialize differently
1340
1341                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1342                manifests.append(manifest)
1343
1344                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1345                if manifest_info:
1346                    manifest_id = manifest_info["properties"]["id"]
1347                    manifest_name = manifest_info["properties"]["name"]
1348                    manifest_path = manifest_info["path"]
1349                    manifest_df = load_df(manifest_path)
1350
1351                    manifest = (
1352                        (datasetId, datasetName),
1353                        (manifest_id, manifest_name),
1354                        ("", ""),
1355                    )
1356                    manifest_loaded.append(manifest)
1357
1358                    annotation_entities = self.storageFileviewTable[
1359                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1360                        & (self.storageFileviewTable["type"] == "folder")
1361                    ]["id"]
1362
1363                    if returnEntities:
1364                        for entityId in annotation_entities:
1365                            if not dry_run:
1366                                moved_entity = self.syn.move(entityId, datasetId)
1367                                self.synapse_entity_tracker.add(
1368                                    synapse_id=moved_entity.id, entity=moved_entity
1369                                )
1370                            else:
1371                                logging.info(
1372                                    f"{entityId} will be moved to folder {datasetId}."
1373                                )
1374                    else:
1375                        # generate project folder
1376                        archive_project_folder = Folder(
1377                            projectId + "_archive", parent=newProjectId
1378                        )
1379                        archive_project_folder = self.syn.store(archive_project_folder)
1380                        self.synapse_entity_tracker.add(
1381                            synapse_id=archive_project_folder.id,
1382                            entity=archive_project_folder,
1383                        )
1384
1385                        # generate dataset folder
1386                        dataset_archive_folder = Folder(
1387                            "_".join([datasetId, datasetName, "archive"]),
1388                            parent=archive_project_folder.id,
1389                        )
1390                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1391                        self.synapse_entity_tracker.add(
1392                            synapse_id=dataset_archive_folder.id,
1393                            entity=dataset_archive_folder,
1394                        )
1395
1396                        for entityId in annotation_entities:
1397                            # move entities to folder
1398                            if not dry_run:
1399                                moved_entity = self.syn.move(
1400                                    entityId, dataset_archive_folder.id
1401                                )
1402                                self.synapse_entity_tracker.add(
1403                                    synapse_id=moved_entity.id, entity=moved_entity
1404                                )
1405                            else:
1406                                logging.info(
1407                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1408                                )
1409        else:
1410            raise LookupError(
1411                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1412            )
1413        return manifests, manifest_loaded

For each manifest csv in a project, look for all the entity ids that are associated. Look up the entity in the files, move the entity to the new project.

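A sketch previewing a move with dry_run, assuming a configured SynapseStorage instance `store`; both project IDs are placeholders:

```python
manifests, loaded = store.move_entities_to_new_project(
    projectId="syn11111111",
    newProjectId="syn22222222",
    dry_run=True,  # log what would be moved instead of moving it
)
```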
@tracer.start_as_current_span('SynapseStorage::get_synapse_table')
def get_synapse_table( self, synapse_id: str) -> Tuple[pandas.core.frame.DataFrame, synapseclient.table.CsvFileTable]:
1415    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1416    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1417        """Download a Synapse table as a pandas dataframe; also return the query results, which carry the table schema and etags
1418
1419        Args:
1420            synapse_id: synapse ID of the table to query
1421        """
1422
1423        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1424        df = results.asDataFrame(
1425            rowIdAndVersionInIndex=False,
1426            na_values=STR_NA_VALUES_FILTERED,
1427            keep_default_na=False,
1428        )
1429
1430        return df, results

Download a Synapse table as a pandas dataframe; also return the query results, which carry the table schema and etags

Arguments:
  • synapse_id: synapse ID of the table to query
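A sketch, assuming a configured SynapseStorage instance `store`; the table ID is a placeholder:

```python
df, results = store.get_synapse_table("syn33333333")
print(df.head())  # `results` additionally carries the table schema and etags
```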
def uploadDB(*args, **kwargs):

Method to upload a database to an asset store. In Synapse, this will upload a metadata table.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest: pd.Df manifest to upload
  • datasetId: synID of the dataset for the manifest
  • table_name: name of the table to be uploaded
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
  • existingTableId: str of the synId of the existing table, if one already exists
  • table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:

  • manifest_table_id: synID of the uploaded table
  • manifest: the original manifest
  • table_manifest: manifest formatted appropriately for the table

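A call sketch based on the argument list above, assuming a configured SynapseStorage instance `store`, a DataModelGraphExplorer `dmge`, and a manifest dataframe `manifest_df`; the dataset ID and table name are placeholders:

```python
manifest_table_id, manifest_out, table_manifest = store.uploadDB(
    dmge=dmge,
    manifest=manifest_df,
    datasetId="syn12345678",
    table_name="BulkRNA-seq_synapse_storage_manifest_table",
    restrict=False,
    table_manipulation="replace",
    table_column_names="class_label",
)
```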
@tracer.start_as_current_span('SynapseStorage::formatDB')
def formatDB(self, dmge, manifest, table_column_names):
1481    @tracer.start_as_current_span("SynapseStorage::formatDB")
1482    def formatDB(self, dmge, manifest, table_column_names):
1483        """
1484        Method to format a manifest appropriately for upload as a table
1485
1486        Args:
1487            dmge: DataModelGraphExplorer object
1488            manifest: pd.Df manifest to upload
1489            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1490                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1491                display label formatting.
1492        Returns:
1493            col_schema: schema for table columns: type, size, etc
1494            table_manifest: formatted manifest
1495
1496        """
1497        # Rename the manifest columns to display names to match fileview
1498
1499        blacklist_chars = ["(", ")", ".", " ", "-"]
1500        manifest_columns = manifest.columns.tolist()
1501
1502        table_manifest = deepcopy(manifest)
1503
1504        if table_column_names == "display_name":
1505            cols = table_manifest.columns
1506
1507        elif table_column_names == "display_label":
1508            cols = [
1509                str(col).translate({ord(x): "" for x in blacklist_chars})
1510                for col in manifest_columns
1511            ]
1512
1513        elif table_column_names == "class_label":
1514            cols = [
1515                get_class_label_from_display_name(str(col)).translate(
1516                    {ord(x): "" for x in blacklist_chars}
1517                )
1518                for col in manifest_columns
1519            ]
1520        else:
1521            raise ValueError(
1522                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1523            )
1524
1525        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1526
1527        # Reset column names in table manifest
1528        table_manifest.columns = cols
1529
1530        # move entity id to end of df
1531        entity_col = table_manifest.pop("entityId")
1532        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1533
1534        # Get the column schema
1535        col_schema = as_table_columns(table_manifest)
1536
1537        # Set Id column length to 64 (for some reason it is not being auto-set)
1538        for i, col in enumerate(col_schema):
1539            if col["name"].lower() == "id":
1540                col_schema[i]["maximumSize"] = 64
1541
1542        return col_schema, table_manifest

Method to format a manifest appropriately for upload as a table

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest: pd.Df manifest to upload
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:

  • col_schema: schema for table columns: type, size, etc.
  • table_manifest: formatted manifest

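A sketch, under the same assumptions as above (`store`, `dmge`, `manifest_df`):

```python
col_schema, table_manifest = store.formatDB(
    dmge, manifest_df, table_column_names="class_label"
)
for col in col_schema:
    print(col["name"], col.get("columnType"), col.get("maximumSize"))
```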
@tracer.start_as_current_span('SynapseStorage::buildDB')
def buildDB( self, datasetId: str, table_name: str, col_schema: List, table_manifest: pandas.core.frame.DataFrame, table_manipulation: str, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, restrict: bool = False):
1544    @tracer.start_as_current_span("SynapseStorage::buildDB")
1545    def buildDB(
1546        self,
1547        datasetId: str,
1548        table_name: str,
1549        col_schema: List,
1550        table_manifest: pd.DataFrame,
1551        table_manipulation: str,
1552        dmge: DataModelGraphExplorer,
1553        restrict: bool = False,
1554    ):
1555        """
1556        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1557        Calls TableOperations class to execute
1558
1559        Args:
1560            datasetId: synID of the dataset for the manifest
1561            table_name: name of the table to be uploaded
1562            col_schema: schema for table columns: type, size, etc from `formatDB`
1563            table_manifest: formatted manifest that can be uploaded as a table
1564            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1565            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1566
1567        Returns:
1568            manifest_table_id: synID of the uploaded table
1569
1570        """
1571        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1572        existing_table_id = self.syn.findEntityId(
1573            name=table_name, parent=table_parent_id
1574        )
1575
1576        tableOps = TableOperations(
1577            synStore=self,
1578            tableToLoad=table_manifest,
1579            tableName=table_name,
1580            datasetId=datasetId,
1581            existingTableId=existing_table_id,
1582            restrict=restrict,
1583            synapse_entity_tracker=self.synapse_entity_tracker,
1584        )
1585
1586        if not table_manipulation or existing_table_id is None:
1587            manifest_table_id = tableOps.createTable(
1588                columnTypeDict=col_schema,
1589                specifySchema=True,
1590            )
1591        elif existing_table_id is not None:
1592            if table_manipulation.lower() == "replace":
1593                manifest_table_id = tableOps.replaceTable(
1594                    specifySchema=True,
1595                    columnTypeDict=col_schema,
1596                )
1597            elif table_manipulation.lower() == "upsert":
1598                manifest_table_id = tableOps.upsertTable(
1599                    dmge=dmge,
1600                )
1601            elif table_manipulation.lower() == "update":
1602                manifest_table_id = tableOps.updateTable()
1603
1604        if table_manipulation and table_manipulation.lower() == "upsert":
1605            table_entity = self.synapse_entity_tracker.get(
1606                synapse_id=existing_table_id or manifest_table_id,
1607                syn=self.syn,
1608                download_file=False,
1609            )
1610            annos = OldAnnotations(
1611                id=table_entity.id,
1612                etag=table_entity.etag,
1613                values=table_entity.annotations,
1614            )
1615            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1616            annos = self.syn.set_annotations(annos)
1617            table_entity.etag = annos.etag
1618            table_entity.annotations = annos
1619
1620        return manifest_table_id

Method to construct the table appropriately: create new table, replace existing, or upsert new into existing Calls TableOperations class to execute

Arguments:
  • datasetId: synID of the dataset for the manifest
  • table_name: name of the table to be uploaded
  • col_schema: schema for table columns: type, size, etc from formatDB
  • table_manifest: formatted manifest that can be uploaded as a table
  • table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
Returns:

manifest_table_id: synID of the uploaded table

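A sketch reusing `col_schema` and `table_manifest` from `formatDB`; the dataset ID and table name are placeholders:

```python
manifest_table_id = store.buildDB(
    datasetId="syn12345678",
    table_name="BulkRNA-seq_synapse_storage_manifest_table",
    col_schema=col_schema,
    table_manifest=table_manifest,
    table_manipulation="upsert",
    dmge=dmge,
    restrict=False,
)
```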
@tracer.start_as_current_span('SynapseStorage::upload_manifest_file')
def upload_manifest_file( self, manifest, metadataManifestPath, datasetId, restrict_manifest, component_name=''):
1622    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1623    def upload_manifest_file(
1624        self,
1625        manifest,
1626        metadataManifestPath,
1627        datasetId,
1628        restrict_manifest,
1629        component_name="",
1630    ):
1631        # Update manifest to have the new entityId column
1632        manifest.to_csv(metadataManifestPath, index=False)
1633
1634        # store manifest to Synapse as a CSV
1635        # update file name
1636        file_name_full = metadataManifestPath.split("/")[-1]
1637        file_extension = file_name_full.split(".")[-1]
1638
1639        # Differentiate "censored" and "uncensored" manifest
1640        if "censored" in file_name_full:
1641            file_name_new = (
1642                os.path.basename(CONFIG.synapse_manifest_basename)
1643                + "_"
1644                + component_name
1645                + "_censored"
1646                + "."
1647                + file_extension
1648            )
1649        else:
1650            file_name_new = (
1651                os.path.basename(CONFIG.synapse_manifest_basename)
1652                + "_"
1653                + component_name
1654                + "."
1655                + file_extension
1656            )
1657
1658        manifest_synapse_file = None
1659        try:
1660            # Rename the file to file_name_new then revert
1661            # This is to maintain the original file name in-case other code is
1662            # expecting that the file exists with the original name
1663            original_file_path = metadataManifestPath
1664            new_file_path = os.path.join(
1665                os.path.dirname(metadataManifestPath), file_name_new
1666            )
1667            os.rename(original_file_path, new_file_path)
1668
1669            manifest_synapse_file = self._store_file_for_manifest_upload(
1670                new_file_path=new_file_path,
1671                dataset_id=datasetId,
1672                existing_file_name=file_name_full,
1673                file_name_new=file_name_new,
1674                restrict_manifest=restrict_manifest,
1675            )
1676            manifest_synapse_file_id = manifest_synapse_file.id
1677
1678        finally:
1679            # Revert the file name back to the original
1680            os.rename(new_file_path, original_file_path)
1681
1682            if manifest_synapse_file:
1683                manifest_synapse_file.path = original_file_path
1684
1685        return manifest_synapse_file_id
async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1742    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1743        """get annotations asynchronously
1744
1745        Args:
1746            synapse_id (str): synapse id of the entity that the annotation belongs to
1747
1748        Returns:
1749            Dict[str, Any]: The requested entity bundle matching
1750            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1751        """
1752        return await get_entity_id_bundle2(
1753            entity_id=synapse_id,
1754            request={"includeAnnotations": True},
1755            synapse_client=self.syn,
1756        )

get annotations asynchronously

Arguments:
  • synapse_id (str): synapse id of the entity that the annotation belongs to
Returns:

Dict[str, Any]: The requested entity bundle matching https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html

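A sketch fetching several bundles concurrently, assuming a configured SynapseStorage instance `store`; the IDs are placeholders:

```python
import asyncio

async def fetch_bundles(store, synapse_ids):
    # Gather entity bundles (including annotations) concurrently
    return await asyncio.gather(
        *(store.get_async_annotation(sid) for sid in synapse_ids)
    )

bundles = asyncio.run(fetch_bundles(store, ["syn111", "syn222"]))
```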
async def store_async_annotation( self, annotation_dict: dict) -> synapseclient.models.annotations.Annotations:
1758    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1759        """store annotation in an async way
1760
1761        Args:
1762            annotation_dict (dict): annotation in a dictionary format
1763
1764        Returns:
1765            Annotations: The stored annotations.
1766        """
1767        annotation_data = Annotations.from_dict(
1768            synapse_annotations=annotation_dict["annotations"]["annotations"]
1769        )
1770        annotation_class = Annotations(
1771            annotations=annotation_data,
1772            etag=annotation_dict["annotations"]["etag"],
1773            id=annotation_dict["annotations"]["id"],
1774        )
1775        annotation_storage_result = await annotation_class.store_async(
1776            synapse_client=self.syn
1777        )
1778        local_entity = self.synapse_entity_tracker.get(
1779            synapse_id=annotation_dict["annotations"]["id"],
1780            syn=self.syn,
1781            download_file=False,
1782            retrieve_if_not_present=False,
1783        )
1784        if local_entity:
1785            local_entity.etag = annotation_storage_result.etag
1786            local_entity.annotations = annotation_storage_result
1787        return annotation_storage_result

store annotation in an async way

Arguments:
  • annotation_dict (dict): annotation in a dictionary format
Returns:

Annotations: The stored annotations.

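A round-trip sketch: the bundle returned by get_async_annotation already has the shape this method expects. `store` is an assumed SynapseStorage instance; the ID is a placeholder:

```python
import asyncio

async def restore_annotations(store, synapse_id):
    # Fetch the entity's annotation bundle, then store it back unchanged
    bundle = await store.get_async_annotation(synapse_id)
    return await store.store_async_annotation(bundle)

stored = asyncio.run(restore_annotations(store, "syn111"))
print(stored.etag)  # Annotations object returned after storing
```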
def process_row_annotations( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, metadata_syn: Dict[str, Any], hide_blanks: bool, csv_list_regex: str, annos: Dict[str, Any], annotation_keys: str) -> Dict[str, Any]:
1789    def process_row_annotations(
1790        self,
1791        dmge: DataModelGraphExplorer,
1792        metadata_syn: Dict[str, Any],
1793        hide_blanks: bool,
1794        csv_list_regex: str,
1795        annos: Dict[str, Any],
1796        annotation_keys: str,
1797    ) -> Dict[str, Any]:
1798        """Processes metadata annotations based on the logic below:
1799        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1800            An empty or whitespace-only string.
1801            A NaN value (if the annotation is a float).
1802        If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
1803        If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1804
1805        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1806        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1807
1808        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1809
1810        4. Returns the updated annotations dictionary.
1811
1812        Args:
1813            dmge (DataModelGraphExplorer): data model graph explorer
1814            metadata_syn (dict): metadata used for Synapse storage
1815            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1816            csv_list_regex (str): Regex to match with comma separated list
1817            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1818            annotation_keys (str): display_label/class_label
1819
1820        Returns:
1821            Dict[str, Any]: annotations as a dictionary
1822
1823        ```mermaid
1824        flowchart TD
1825            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1826            C -- Yes --> D{Is hide_blanks True?}
1827            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1828            D -- No --> F[Assign empty string to annotation key]
1829            C -- No --> G{Is anno_v a string?}
1830            G -- No --> H[Assign original value of anno_v to annotation key]
1831            G -- Yes --> I{Does anno_v match csv_list_regex?}
1832            I -- Yes --> J[Get validation rule of anno_k]
1833            J --> K{Does the validation rule contain 'list'}
1834            K -- Yes --> L[Split anno_v by commas and assign as list]
1835            I -- No --> H
1836            K -- No --> H
1837        ```
1838        """
1839        for anno_k, anno_v in metadata_syn.items():
1840            # Remove keys whose values are NaN, empty strings, or whitespace-only strings
1841            # from the dict of annotations to be uploaded, if present on the current data annotation
1842            if hide_blanks and (
1843                (isinstance(anno_v, str) and anno_v.strip() == "")
1844                or (isinstance(anno_v, float) and np.isnan(anno_v))
1845            ):
1846                # Drop the key from the annotations to be uploaded, if present
1847                if anno_k in annos["annotations"]["annotations"]:
1848                    annos["annotations"]["annotations"].pop(anno_k)
1849                continue
1850
1851            # Otherwise save annotation as appropriate
1852            if isinstance(anno_v, float) and np.isnan(anno_v):
1853                annos["annotations"]["annotations"][anno_k] = ""
1854                continue
1855
1856            # Handle strings that match the csv_list_regex and pass the validation rule
1857            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1858                # Use a dictionary to dynamically choose the argument
1859                param = (
1860                    {"node_display_name": anno_k}
1861                    if annotation_keys == "display_label"
1862                    else {"node_label": anno_k}
1863                )
1864                node_validation_rules = dmge.get_node_validation_rules(**param)
1865
1866                if rule_in_rule_list("list", node_validation_rules):
1867                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1868                    continue
1869            # default: assign the original value
1870            annos["annotations"]["annotations"][anno_k] = anno_v
1871
1872        return annos

Processes metadata annotations based on the logic below:

  1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is an empty or whitespace-only string, or a NaN value (if the annotation is a float). If either condition is met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped; if hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
  2. If the value is a string and matches the pattern defined by csv_list_regex, gets the validation rule based on "node label" or "node display name". If the rule contains "list", splits the string by commas and assigns the resulting list as the annotation value for that key.

  3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

  4. Returns the updated annotations dictionary.

Arguments:
  • dmge (DataModelGraphExplorer): data model graph explorer
  • metadata_syn (dict): metadata used for Synapse storage
  • hide_blanks (bool): if true, does not upload annotation keys with blank values.
  • csv_list_regex (str): Regex to match with comma separated list
  • annos (Dict[str, Any]): dictionary of annotation returned from synapse
  • annotation_keys (str): display_label/class_label
Returns:

Dict[str, Any]: annotations as a dictionary

```mermaid
flowchart TD
    A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
    C -- Yes --> D{Is hide_blanks True?}
    D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
    D -- No --> F[Assign empty string to annotation key]
    C -- No --> G{Is anno_v a string?}
    G -- No --> H[Assign original value of anno_v to annotation key]
    G -- Yes --> I{Does anno_v match csv_list_regex?}
    I -- Yes --> J[Get validation rule of anno_k]
    J --> K{Does the validation rule contain 'list'}
    K -- Yes --> L[Split anno_v by commas and assign as list]
    I -- No --> H
    K -- No --> H
```
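A call sketch, assuming `store` and `dmge` as above; "Genre" and "Notes" are hypothetical annotation keys presumed to exist in the data model:

```python
from schematic.utils.validate_utils import comma_separated_list_regex

annos = {"annotations": {"annotations": {"Notes": "stale value"}}}
updated = store.process_row_annotations(
    dmge=dmge,
    metadata_syn={"Genre": "a, b, c", "Notes": ""},
    hide_blanks=True,
    csv_list_regex=comma_separated_list_regex(),
    annos=annos,
    annotation_keys="class_label",
)
# "Notes" is dropped (blank value with hide_blanks=True); "Genre" is split
# into a list if its validation rules include "list"
```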
async def format_row_annotations(*args: Any, **kwargs: Any) -> Any:
def format_manifest_annotations(*args, **kwargs):

Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. For now just getting the Component.

@tracer.start_as_current_span('SynapseStorage::add_annotations_to_entities_files')
async def add_annotations_to_entities_files( self, dmge, manifest, manifest_record_type: str, datasetId: str, hideBlanks: bool, manifest_synapse_table_id='', annotation_keys: str = 'class_label'):
2234    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2235    async def add_annotations_to_entities_files(
2236        self,
2237        dmge,
2238        manifest,
2239        manifest_record_type: str,
2240        datasetId: str,
2241        hideBlanks: bool,
2242        manifest_synapse_table_id="",
2243        annotation_keys: str = "class_label",
2244    ):
2245        """
2246        Depending on upload type, add IDs to the entityId column. Add annotations to connected
2247        files and folders. Despite the name of this function, it also applies to folders.
2248
2249        Args:
2250            dmge: DataModelGraphExplorer Object
2251            manifest (pd.DataFrame): loaded df containing user supplied data.
2252            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2253            datasetId (str): synapse ID of folder containing the dataset
2254            hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
2255            manifest_synapse_table_id (str): Default is an empty string ''.
2256            annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
2257                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2258                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2259        Returns:
2260            manifest (pd.DataFrame): modified to add entityId as appropriate
2261
2262        """
2263
2264        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2265        if "filename" in [col.lower() for col in manifest.columns]:
2266            # get current list of files and store as dataframe
2267            dataset_files = self.getFilesInStorageDataset(datasetId)
2268            files_and_entityIds = self._get_file_entityIds(
2269                dataset_files=dataset_files, only_new_files=False
2270            )
2271            file_df = pd.DataFrame(files_and_entityIds)
2272
2273            # Merge dataframes to add entityIds
2274            manifest = manifest.merge(
2275                file_df, how="left", on="Filename", suffixes=["_x", None]
2276            ).drop("entityId_x", axis=1)
2277
2278        # Fill `entityId` for each row if missing and annotate entity as appropriate
2279        requests = set()
2280        for idx, row in manifest.iterrows():
2281            if not row["entityId"] and (
2282                manifest_record_type == "file_and_entities"
2283                or manifest_record_type == "table_file_and_entities"
2284            ):
2285                manifest, entityId = self._create_entity_id(
2286                    idx, row, manifest, datasetId
2287                )
2288            elif not row["entityId"] and manifest_record_type == "table_and_file":
2289                # If not using entityIds, fill with manifest_table_id so the column is not left blank
2290                row["entityId"] = manifest_synapse_table_id
2291                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2292                entityId = ""
2293                # If the row is the manifest table, do not add annotations
2294            elif row["entityId"] == manifest_synapse_table_id:
2295                entityId = ""
2296            else:
2297                # get the file id of the file to annotate, collected in above step.
2298                entityId = row["entityId"]
2299
2300            # Adding annotations to connected files.
2301            if entityId:
2302                # Format annotations for Synapse
2303                annos_task = asyncio.create_task(
2304                    self.format_row_annotations(
2305                        dmge, row, entityId, hideBlanks, annotation_keys
2306                    )
2307                )
2308                requests.add(annos_task)
2309        await self._process_store_annos(requests)
2310        return manifest

Depending on the upload type, fill in the entityId column and add annotations to the connected files. Despite the name of this function, it also applies to folders.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • datasetId (str): synapse ID of folder containing the dataset
  • hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
  • manifest_synapse_table_id (str): Default is an empty string ''.
  • annotation_keys (str): display_label/class_label (default). Determines the labeling style for annotation keys. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:

manifest (pd.DataFrame): modified to add entityId as appropriate
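
A minimal usage sketch (hypothetical IDs; assumes `store` is an initialized SynapseStorage instance, `dmge` a DataModelGraphExplorer, and `manifest` a loaded DataFrame that already contains an entityId column). The method is a coroutine, so it is driven with asyncio.run, exactly as the upload_manifest_* methods below do internally:

    import asyncio

    manifest = asyncio.run(
        store.add_annotations_to_entities_files(
            dmge=dmge,
            manifest=manifest,
            manifest_record_type="file_and_entities",
            datasetId="syn12345678",  # hypothetical dataset folder ID
            hideBlanks=True,
        )
    )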

@tracer.start_as_current_span('SynapseStorage::upload_manifest_as_table')
def upload_manifest_as_table( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, manifest: pandas.core.frame.DataFrame, metadataManifestPath: str, datasetId: str, table_name: str, component_name: str, restrict: bool, manifest_record_type: str, hideBlanks: bool, table_manipulation: str, table_column_names: str, annotation_keys: str, file_annotations_upload: bool = True):
2312    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2313    def upload_manifest_as_table(
2314        self,
2315        dmge: DataModelGraphExplorer,
2316        manifest: pd.DataFrame,
2317        metadataManifestPath: str,
2318        datasetId: str,
2319        table_name: str,
2320        component_name: str,
2321        restrict: bool,
2322        manifest_record_type: str,
2323        hideBlanks: bool,
2324        table_manipulation: str,
2325        table_column_names: str,
2326        annotation_keys: str,
2327        file_annotations_upload: bool = True,
2328    ):
2329        """Upload manifest to Synapse as a table and csv.
2330        Args:
2331            dmge: DataModelGraphExplorer object
2332            manifest (pd.DataFrame): loaded df containing user supplied data.
2333            metadataManifestPath: path to csv containing a validated metadata manifest.
2334            datasetId (str): synapse ID of folder containing the dataset
2335            table_name (str): Generated to name the table being uploaded.
2336            component_name (str): Name of the component manifest that is currently being uploaded.
2337            restrict (bool): Flag for censored data.
2338            manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2339            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
2340            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2341            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name. class_label formats the display
2342                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2343                display label formatting.
2344            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display
2345                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2346                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2347            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2348        Returns:
2349            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2350        """
2351        # Upload manifest as a table, get the ID and updated manifest.
2352        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2353            dmge=dmge,
2354            manifest=manifest,
2355            datasetId=datasetId,
2356            table_name=table_name,
2357            restrict=restrict,
2358            table_manipulation=table_manipulation,
2359            table_column_names=table_column_names,
2360        )
2361
2362        if file_annotations_upload:
2363            manifest = asyncio.run(
2364                self.add_annotations_to_entities_files(
2365                    dmge,
2366                    manifest,
2367                    manifest_record_type,
2368                    datasetId,
2369                    hideBlanks,
2370                    manifest_synapse_table_id,
2371                    annotation_keys,
2372                )
2373            )
2374        # Load manifest to synapse as a CSV File
2375        manifest_synapse_file_id = self.upload_manifest_file(
2376            manifest=manifest,
2377            metadataManifestPath=metadataManifestPath,
2378            datasetId=datasetId,
2379            restrict_manifest=restrict,
2380            component_name=component_name,
2381        )
2382
2383        # Set annotations for the file manifest.
2384        manifest_annotations = self.format_manifest_annotations(
2385            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2386        )
2387        annos = self.syn.set_annotations(annotations=manifest_annotations)
2388        manifest_entity = self.synapse_entity_tracker.get(
2389            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2390        )
2391        manifest_entity.annotations = annos
2392        manifest_entity.etag = annos.etag
2393
2394        logger.info("Associated manifest file with dataset on Synapse.")
2395
2396        # Update manifest Synapse table with new entity id column.
2397        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2398            dmge=dmge,
2399            manifest=manifest,
2400            datasetId=datasetId,
2401            table_name=table_name,
2402            restrict=restrict,
2403            table_manipulation="update",
2404            table_column_names=table_column_names,
2405        )
2406
2407        # Set annotations for the table manifest
2408        manifest_annotations = self.format_manifest_annotations(
2409            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2410        )
2411        annotations_manifest_table = self.syn.set_annotations(
2412            annotations=manifest_annotations
2413        )
2414        manifest_table_entity = self.synapse_entity_tracker.get(
2415            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2416        )
2417        manifest_table_entity.annotations = annotations_manifest_table
2418        manifest_table_entity.etag = annotations_manifest_table.etag
2419
2420        return manifest_synapse_file_id

Upload manifest to Synapse as a table and csv.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • table_name (str): Generated to name the table being uploaded.
  • component_name (str): Name of the component manifest that is currently being uploaded.
  • restrict (bool): Flag for censored data.
  • manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
  • table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting.
  • annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
Returns:

manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
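
A hedged usage sketch (all IDs, paths, and names below are hypothetical; assumes `store`, `dmge`, and `manifest` as above):

    manifest_synapse_file_id = store.upload_manifest_as_table(
        dmge=dmge,
        manifest=manifest,
        metadataManifestPath="path/to/manifest.csv",
        datasetId="syn12345678",
        table_name="example_manifest_table",
        component_name="Biospecimen",
        restrict=False,
        manifest_record_type="table_and_file",
        hideBlanks=False,
        table_manipulation="replace",
        table_column_names="class_label",
        annotation_keys="class_label",
    )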

@tracer.start_as_current_span('SynapseStorage::upload_manifest_as_csv')
def upload_manifest_as_csv( self, dmge, manifest, metadataManifestPath, datasetId, restrict, manifest_record_type, hideBlanks, component_name, annotation_keys: str, file_annotations_upload: bool = True):
2422    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2423    def upload_manifest_as_csv(
2424        self,
2425        dmge,
2426        manifest,
2427        metadataManifestPath,
2428        datasetId,
2429        restrict,
2430        manifest_record_type,
2431        hideBlanks,
2432        component_name,
2433        annotation_keys: str,
2434        file_annotations_upload: bool = True,
2435    ):
2436        """Upload manifest to Synapse as a csv only.
2437        Args:
2438            dmge: DataModelGraphExplorer object
2439            manifest (pd.DataFrame): loaded df containing user supplied data.
2440            metadataManifestPath: path to csv containing a validated metadata manifest.
2441            datasetId (str): synapse ID of folder containing the dataset
2442            restrict (bool): Flag for censored data.
2443            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2444            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
2445            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display
2446                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2447                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2448            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2449        Returns:
2450            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2451        """
2452        if file_annotations_upload:
2453            manifest = asyncio.run(
2454                self.add_annotations_to_entities_files(
2455                    dmge,
2456                    manifest,
2457                    manifest_record_type,
2458                    datasetId,
2459                    hideBlanks,
2460                    annotation_keys=annotation_keys,
2461                )
2462            )
2463
2464        # Load manifest to synapse as a CSV File
2465        manifest_synapse_file_id = self.upload_manifest_file(
2466            manifest,
2467            metadataManifestPath,
2468            datasetId,
2469            restrict,
2470            component_name=component_name,
2471        )
2472
2473        # Set annotations for the file manifest.
2474        manifest_annotations = self.format_manifest_annotations(
2475            manifest, manifest_synapse_file_id
2476        )
2477        annos = self.syn.set_annotations(manifest_annotations)
2478        manifest_entity = self.synapse_entity_tracker.get(
2479            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2480        )
2481        manifest_entity.annotations = annos
2482        manifest_entity.etag = annos.etag
2483
2484        logger.info("Associated manifest file with dataset on Synapse.")
2485
2486        return manifest_synapse_file_id

Upload manifest to Synapse as a csv only.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • restrict (bool): Flag for censored data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
  • annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
Returns:

manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
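
A hedged usage sketch (hypothetical IDs and paths; assumes `store`, `dmge`, and `manifest` as above):

    manifest_synapse_file_id = store.upload_manifest_as_csv(
        dmge=dmge,
        manifest=manifest,
        metadataManifestPath="path/to/manifest.csv",
        datasetId="syn12345678",
        restrict=False,
        manifest_record_type="file_only",
        hideBlanks=False,
        component_name="Patient",
        annotation_keys="class_label",
        file_annotations_upload=False,  # skip per-file annotations and just store the CSV
    )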

@tracer.start_as_current_span('SynapseStorage::upload_manifest_combo')
def upload_manifest_combo( self, dmge, manifest, metadataManifestPath, datasetId, table_name, component_name, restrict, manifest_record_type, hideBlanks, table_manipulation, table_column_names: str, annotation_keys: str, file_annotations_upload: bool = True):
2488    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2489    def upload_manifest_combo(
2490        self,
2491        dmge,
2492        manifest,
2493        metadataManifestPath,
2494        datasetId,
2495        table_name,
2496        component_name,
2497        restrict,
2498        manifest_record_type,
2499        hideBlanks,
2500        table_manipulation,
2501        table_column_names: str,
2502        annotation_keys: str,
2503        file_annotations_upload: bool = True,
2504    ):
2505        """Upload manifest to Synapse as a table and CSV with entities.
2506        Args:
2507            dmge: DataModelGraphExplorer object
2508            manifest (pd.DataFrame): loaded df containing user supplied data.
2509            metadataManifestPath: path to csv containing a validated metadata manifest.
2510            datasetId (str): synapse ID of folder containing the dataset
2511            table_name (str): Generated to name the table being uploaded.
2512            component_name (str): Name of the component manifest that is currently being uploaded.
2513            restrict (bool): Flag for censored data.
2514            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2515            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
2516            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2517            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name. class_label formats the display
2518                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2519                display label formatting.
2520            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display
2521                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2522                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2523            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2524        Returns:
2525            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2526        """
2527        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2528            dmge=dmge,
2529            manifest=manifest,
2530            datasetId=datasetId,
2531            table_name=table_name,
2532            restrict=restrict,
2533            table_manipulation=table_manipulation,
2534            table_column_names=table_column_names,
2535        )
2536
2537        if file_annotations_upload:
2538            manifest = asyncio.run(
2539                self.add_annotations_to_entities_files(
2540                    dmge,
2541                    manifest,
2542                    manifest_record_type,
2543                    datasetId,
2544                    hideBlanks,
2545                    manifest_synapse_table_id,
2546                    annotation_keys=annotation_keys,
2547                )
2548            )
2549
2550        # Load manifest to synapse as a CSV File
2551        manifest_synapse_file_id = self.upload_manifest_file(
2552            manifest, metadataManifestPath, datasetId, restrict, component_name
2553        )
2554
2555        # Set annotations for the file manifest.
2556        manifest_annotations = self.format_manifest_annotations(
2557            manifest, manifest_synapse_file_id
2558        )
2559        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2560        manifest_entity = self.synapse_entity_tracker.get(
2561            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2562        )
2563        manifest_entity.annotations = file_manifest_annotations
2564        manifest_entity.etag = file_manifest_annotations.etag
2565        logger.info("Associated manifest file with dataset on Synapse.")
2566
2567        # Update manifest Synapse table with new entity id column.
2568        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2569            dmge=dmge,
2570            manifest=manifest,
2571            datasetId=datasetId,
2572            table_name=table_name,
2573            restrict=restrict,
2574            table_manipulation="update",
2575            table_column_names=table_column_names,
2576        )
2577
2578        # Set annotations for the table manifest
2579        manifest_annotations = self.format_manifest_annotations(
2580            manifest, manifest_synapse_table_id
2581        )
2582        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2583        manifest_entity = self.synapse_entity_tracker.get(
2584            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2585        )
2586        manifest_entity.annotations = table_manifest_annotations
2587        manifest_entity.etag = table_manifest_annotations.etag
2588        return manifest_synapse_file_id

Upload manifest to Synapse as a table and CSV with entities.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • table_name (str): Generated to name the table being uploaded.
  • component_name (str): Name of the component manifest that is currently being uploaded.
  • restrict (bool): Flag for censored data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
  • table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting.
  • annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
Returns:

manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
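
A hedged usage sketch (hypothetical IDs, paths, and names; assumes `store`, `dmge`, and `manifest` as above). This is the method associateMetadataWithFiles dispatches to for manifest_record_type='table_file_and_entities':

    manifest_synapse_file_id = store.upload_manifest_combo(
        dmge=dmge,
        manifest=manifest,
        metadataManifestPath="path/to/manifest.csv",
        datasetId="syn12345678",
        table_name="example_manifest_table",
        component_name="Biospecimen",
        restrict=True,  # apply additional access restrictions for censored data
        manifest_record_type="table_file_and_entities",
        hideBlanks=True,
        table_manipulation="upsert",
        table_column_names="display_label",
        annotation_keys="display_label",
    )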

@tracer.start_as_current_span('SynapseStorage::associateMetadataWithFiles')
def associateMetadataWithFiles( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, metadataManifestPath: str, datasetId: str, manifest_record_type: str = 'table_file_and_entities', hideBlanks: bool = False, restrict_manifest=False, table_manipulation: str = 'replace', table_column_names: str = 'class_label', annotation_keys: str = 'class_label', file_annotations_upload: bool = True) -> str:
2590    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2591    def associateMetadataWithFiles(
2592        self,
2593        dmge: DataModelGraphExplorer,
2594        metadataManifestPath: str,
2595        datasetId: str,
2596        manifest_record_type: str = "table_file_and_entities",
2597        hideBlanks: bool = False,
2598        restrict_manifest=False,
2599        table_manipulation: str = "replace",
2600        table_column_names: str = "class_label",
2601        annotation_keys: str = "class_label",
2602        file_annotations_upload: bool = True,
2603    ) -> str:
2604        """Associate metadata with files in a storage dataset already on Synapse.
2605        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2606
2607        If this is a new manifest, there may be no Synapse entities associated with its rows.
2608        This may be due to the data type (e.g. clinical data) being tabular
2609        and not requiring files; to utilize uniform interfaces downstream
2610        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2611        and an entity column is added to the manifest containing the resulting
2612        entity IDs; a table is also created at present as an additional interface
2613        for downstream query and interaction with the data.
2614
2615        Args:
2616            dmge: DataModelGraphExplorer Object
2617            metadataManifestPath: path to csv containing a validated metadata manifest.
2618                The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2619                Some datasets, e.g. clinical data, do not contain file ids, but data is stored in a table: one row per item.
2620                In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to this file.
2621            datasetId: synapse ID of folder containing the dataset
2622            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table_and_file' options in combination.
2623            hideBlanks: Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
2624            restrict_manifest (bool): Default is False. Flag for censored data.
2625            table_manipulation (str): Default is 'replace'. Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2626            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name. class_label formats the display
2627                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2628                display label formatting.
2629            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display
2630                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2631                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2632        Returns:
2633            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2634        """
2635        # Read new manifest CSV:
2636        manifest = self._read_manifest(metadataManifestPath)
2637        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2638
2639        table_name, component_name = self._generate_table_name(manifest)
2640
2641        # Upload manifest to synapse based on user input (manifest_record_type)
2642        if manifest_record_type == "file_only":
2643            manifest_synapse_file_id = self.upload_manifest_as_csv(
2644                dmge=dmge,
2645                manifest=manifest,
2646                metadataManifestPath=metadataManifestPath,
2647                datasetId=datasetId,
2648                restrict=restrict_manifest,
2649                hideBlanks=hideBlanks,
2650                manifest_record_type=manifest_record_type,
2651                component_name=component_name,
2652                annotation_keys=annotation_keys,
2653                file_annotations_upload=file_annotations_upload,
2654            )
2655        elif manifest_record_type == "table_and_file":
2656            manifest_synapse_file_id = self.upload_manifest_as_table(
2657                dmge=dmge,
2658                manifest=manifest,
2659                metadataManifestPath=metadataManifestPath,
2660                datasetId=datasetId,
2661                table_name=table_name,
2662                component_name=component_name,
2663                restrict=restrict_manifest,
2664                hideBlanks=hideBlanks,
2665                manifest_record_type=manifest_record_type,
2666                table_manipulation=table_manipulation,
2667                table_column_names=table_column_names,
2668                annotation_keys=annotation_keys,
2669                file_annotations_upload=file_annotations_upload,
2670            )
2671        elif manifest_record_type == "file_and_entities":
2672            manifest_synapse_file_id = self.upload_manifest_as_csv(
2673                dmge=dmge,
2674                manifest=manifest,
2675                metadataManifestPath=metadataManifestPath,
2676                datasetId=datasetId,
2677                restrict=restrict_manifest,
2678                hideBlanks=hideBlanks,
2679                manifest_record_type=manifest_record_type,
2680                component_name=component_name,
2681                annotation_keys=annotation_keys,
2682                file_annotations_upload=file_annotations_upload,
2683            )
2684        elif manifest_record_type == "table_file_and_entities":
2685            manifest_synapse_file_id = self.upload_manifest_combo(
2686                dmge=dmge,
2687                manifest=manifest,
2688                metadataManifestPath=metadataManifestPath,
2689                datasetId=datasetId,
2690                table_name=table_name,
2691                component_name=component_name,
2692                restrict=restrict_manifest,
2693                hideBlanks=hideBlanks,
2694                manifest_record_type=manifest_record_type,
2695                table_manipulation=table_manipulation,
2696                table_column_names=table_column_names,
2697                annotation_keys=annotation_keys,
2698                file_annotations_upload=file_annotations_upload,
2699            )
2700        else:
2701            raise ValueError("Please enter a valid manifest_record_type.")
2702        return manifest_synapse_file_id

Associate metadata with files in a storage dataset already on Synapse. Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.

If this is a new manifest, there may be no Synapse entities associated with its rows. This may be due to the data type (e.g. clinical data) being tabular and not requiring files; to utilize uniform interfaces downstream (i.e. fileviews), a Synapse entity (a folder) is created for each row and an entity column is added to the manifest containing the resulting entity IDs; a table is also created at present as an additional interface for downstream query and interaction with the data.

Arguments:
  • dmge: DataModelGraphExplorer Object
  • metadataManifestPath: path to csv containing a validated metadata manifest. The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. Some datasets, e.g. clinical data, do not contain file ids, but data is stored in a table: one row per item. In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to this file.
  • datasetId: synapse ID of folder containing the dataset
  • manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table_and_file' options in combination.
  • hideBlanks: Default is False. When True, annotation keys with blank values are not uploaded; when False, annotation keys with empty string values are uploaded.
  • restrict_manifest (bool): Default is False. Flag for censored data.
  • table_manipulation (str): Default is 'replace'. Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting.
  • annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:

manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
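
This is the main entry point that dispatches to upload_manifest_as_csv, upload_manifest_as_table, or upload_manifest_combo based on manifest_record_type. A hedged end-to-end sketch (hypothetical IDs and paths; assumes a SynapseStorage instance and DataModelGraphExplorer are constructed as configured elsewhere in schematic):

    store = SynapseStorage()  # assumption: default construction picks up configured credentials
    manifest_synapse_file_id = store.associateMetadataWithFiles(
        dmge=dmge,
        metadataManifestPath="path/to/validated_manifest.csv",
        datasetId="syn12345678",
        manifest_record_type="table_and_file",
        table_manipulation="upsert",
    )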

def getTableAnnotations(self, table_id: str):
2704    def getTableAnnotations(self, table_id: str):
2705        """Generate dictionary of annotations for the given Synapse file.
2706        Synapse returns all custom annotations as lists since they
2707        can contain multiple values. In all cases, the values will
2708        be converted into strings and concatenated with ", ".
2709
2710        Args:
2711            fileId (str): Synapse ID for dataset file.
2712
2713        Returns:
2714            dict: Annotations as comma-separated strings.
2715        """
2716        try:
2717            entity = self.synapse_entity_tracker.get(
2718                synapse_id=table_id, syn=self.syn, download_file=False
2719            )
2720            is_table = entity.concreteType.endswith(".TableEntity")
2721            annotations_raw = entity.annotations
2722        except SynapseHTTPError:
2723            # If an error occurs with retrieving entity, skip it
2724            # This could be caused by a temporary file view that
2725            # was deleted since its ID was retrieved
2726            is_table = False
2727
2728        # Skip anything that isn't a table
2729        if not is_table:
2730            return None
2731
2732        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2733
2734        return annotations

Generate dictionary of annotations for the given Synapse table. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".

Arguments:
  • table_id (str): Synapse ID for the table.
Returns:

dict: Annotations as comma-separated strings.
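
A brief usage sketch (hypothetical table ID). None is returned when the entity is not a table:

    annos = store.getTableAnnotations("syn23456789")
    if annos is not None:
        print(annos["entityId"], annos["eTag"])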

def getFileAnnotations(self, fileId: str) -> Optional[Dict[str, str]]:
2736    def getFileAnnotations(self, fileId: str) -> Optional[Dict[str, str]]:
2737        """Generate dictionary of annotations for the given Synapse file.
2738        Synapse returns all custom annotations as lists since they
2739        can contain multiple values. In all cases, the values will
2740        be converted into strings and concatenated with ", ".
2741
2742        Args:
2743            fileId (str): Synapse ID for dataset file.
2744
2745        Returns:
2746            dict: Annotations as comma-separated strings.
2747        """
2748
2749        # Get entity metadata, including annotations
2750        try:
2751            entity = self.synapse_entity_tracker.get(
2752                synapse_id=fileId, syn=self.syn, download_file=False
2753            )
2754            is_file = entity.concreteType.endswith(".FileEntity")
2755            is_folder = entity.concreteType.endswith(".Folder")
2756            annotations_raw = entity.annotations
2757        except SynapseHTTPError:
2758            # If an error occurs with retrieving entity, skip it
2759            # This could be caused by a temporary file view that
2760            # was deleted since its ID was retrieved
2761            is_file, is_folder = False, False
2762
2763        # Skip anything that isn't a file or folder
2764        if not (is_file or is_folder):
2765            return None
2766
2767        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2768
2769        return annotations

Generate dictionary of annotations for the given Synapse file. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".

Arguments:
  • fileId (str): Synapse ID for dataset file.
Returns:

dict: Annotations as comma-separated strings.
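
A brief usage sketch (hypothetical file ID). None is returned for anything that isn't a file or folder:

    file_annos = store.getFileAnnotations("syn34567890")
    if file_annos:
        print(file_annos.get("YearofBirth", ""))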

def getEntityAnnotations(self, fileId, entity, annotations_raw):
2771    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2772        # Extract annotations from their lists and stringify. For example:
2773        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2774        annotations = dict()
2775        for key, vals in annotations_raw.items():
2776            if isinstance(vals, list) and len(vals) == 1:
2777                annotations[key] = str(vals[0])
2778            else:
2779                annotations[key] = ", ".join(str(v) for v in vals)
2780
2781        # Add the file entity ID and eTag, which weren't lists
2782        assert fileId == entity.id, (
2783            "For some reason, the Synapse ID in the response doesn't match"
2784            "the Synapse ID sent in the request (via synapseclient)."
2785        )
2786        annotations["entityId"] = fileId
2787        annotations["eTag"] = entity.etag
2788
2789        return annotations
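
For illustration, the stringification above turns the raw annotations from the example comment into flat strings and appends the entity ID and eTag:

    raw = {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}
    # getEntityAnnotations(fileId, entity, raw) would yield:
    # {"YearofBirth": "1980", "author": "bruno, milen, sujay",
    #  "entityId": "<fileId>", "eTag": "<entity.etag>"}
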
def getDatasetAnnotations( self, datasetId: str, fill_na: bool = True, force_batch: bool = False) -> pandas.core.frame.DataFrame:
2791    def getDatasetAnnotations(
2792        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2793    ) -> pd.DataFrame:
2794        """Generate table for annotations across all files in given dataset.
2795
2796        Args:
2797            datasetId (str): Synapse ID for dataset folder.
2798            fill_na (bool): Whether to replace missing values with
2799                blank strings.
2800            force_batch (bool): Whether to force the function to use
2801                the batch mode, which uses a file view to retrieve
2802                annotations for a given dataset. Defaults to False;
2803                batch mode is used automatically when the dataset contains 50 or more files.
2804
2805        Returns:
2806            pd.DataFrame: Table of annotations.
2807        """
2808        # Get all files in given dataset
2809        dataset_files = self.getFilesInStorageDataset(datasetId)
2810
2811        # if there are no dataset files, there are no annotations
2812        # return an empty DataFrame
2813        if not dataset_files:
2814            return pd.DataFrame()
2815
2816        dataset_files_map = dict(dataset_files)
2817        dataset_file_ids, _ = list(zip(*dataset_files))
2818
2819        # Get annotations for each file from Step 1
2820        # Batch mode
2821        try_batch = len(dataset_files) >= 50 or force_batch
2822        if try_batch:
2823            try:
2824                logger.info("Trying batch mode for retrieving Synapse annotations")
2825                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2826            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2827                logger.info(
2828                    f"Unable to create a temporary file view bound to {datasetId}. "
2829                    "Defaulting to slower iterative retrieval of annotations."
2830                )
2831                # Fall back to the slower non-batch method
2833                try_batch = False
2834
2835        # Non-batch mode
2836        if not try_batch:
2837            logger.info("Using slower (non-batch) sequential mode")
2838            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2839            # Remove any annotations for non-file/folders (stored as None)
2840            records = filter(None, records)
2841            table = pd.DataFrame.from_records(records)
2842
2843        # Add filenames for the files that "survived" annotation retrieval
2844        filenames = [dataset_files_map[i] for i in table["entityId"]]
2845
2846        if "Filename" not in table.columns:
2847            table.insert(0, "Filename", filenames)
2848
2849        # Ensure that entityId and eTag are at the end
2850        entity_ids = table.pop("entityId")
2851        etags = table.pop("eTag")
2852        table.insert(len(table.columns), "entityId", entity_ids)
2853        table.insert(len(table.columns), "eTag", etags)
2854
2855        # Missing values are filled in with empty strings for Google Sheets
2856        if fill_na:
2857            table.fillna("", inplace=True)
2858
2859        # Force all values as strings
2860        return table.astype(str)

Generate table for annotations across all files in given dataset.

Arguments:
  • datasetId (str): Synapse ID for dataset folder.
  • fill_na (bool): Whether to replace missing values with blank strings.
  • force_batch (bool): Whether to force the function to use the batch mode, which uses a file view to retrieve annotations for a given dataset. Defaults to False; batch mode is used automatically when the dataset contains 50 or more files.
Returns:

pd.DataFrame: Table of annotations.
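
A brief usage sketch (hypothetical dataset ID). The returned table places Filename first and entityId/eTag last, with every value cast to a string:

    annos_table = store.getDatasetAnnotations("syn12345678")
    annos_table.to_csv("dataset_annotations.csv", index=False)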

def raise_final_error(retry_state):
2862    def raise_final_error(retry_state):
2863        return retry_state.outcome.result()
def checkIfinAssetView(self, syn_id) -> bool:
2865    def checkIfinAssetView(self, syn_id) -> bool:
2866        # get data in administrative fileview for this pipeline
2867        assetViewTable = self.getStorageFileviewTable()
2868        all_files = list(assetViewTable["id"])
2869        if syn_id in all_files:
2870            return True
2871        else:
2872            return False
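
A brief usage sketch (hypothetical ID):

    if not store.checkIfinAssetView("syn12345678"):
        print("syn12345678 is not within the configured asset view")
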
@tracer.start_as_current_span('SynapseStorage::getDatasetProject')
@retry(stop=stop_after_attempt(5), wait=wait_chain(*[wait_fixed(10) for i in range(2)] + [wait_fixed(15) for i in range(2)] + [wait_fixed(20)]), retry=retry_if_exception_type(LookupError), retry_error_callback=raise_final_error)
def getDatasetProject(self, datasetId: str) -> str:
2874    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2875    @retry(
2876        stop=stop_after_attempt(5),
2877        wait=wait_chain(
2878            *[wait_fixed(10) for i in range(2)]
2879            + [wait_fixed(15) for i in range(2)]
2880            + [wait_fixed(20)]
2881        ),
2882        retry=retry_if_exception_type(LookupError),
2883        retry_error_callback=raise_final_error,
2884    )
2885    def getDatasetProject(self, datasetId: str) -> str:
2886        """Get parent project for a given dataset ID.
2887
2888        Args:
2889            datasetId (str): Synapse entity ID (folder or project).
2890
2891        Raises:
2892            PermissionError: Raised if the given dataset isn't accessible to the user.
2893            LookupError: Raised if the dataset doesn't appear in the configured file view.
2894
2895        Returns:
2896            str: The Synapse ID for the parent project.
2897        """
2898
2899        # Subset main file view
2900        dataset_index = self.storageFileviewTable["id"] == datasetId
2901        dataset_row = self.storageFileviewTable[dataset_index]
2902
2903        # re-query if no datasets found
2904        if dataset_row.empty:
2905            sleep(5)
2906            self.query_fileview(force_requery=True)
2907            # Subset main file view
2908            dataset_index = self.storageFileviewTable["id"] == datasetId
2909            dataset_row = self.storageFileviewTable[dataset_index]
2910
2911        # Return `projectId` for given row if only one found
2912        if len(dataset_row) == 1:
2913            dataset_project = dataset_row["projectId"].values[0]
2914            return dataset_project
2915
2916        # Otherwise, check if already project itself
2917        try:
2918            syn_object = self.synapse_entity_tracker.get(
2919                synapse_id=datasetId, syn=self.syn, download_file=False
2920            )
2921            if syn_object.properties["concreteType"].endswith("Project"):
2922                return datasetId
2923        except SynapseHTTPError:
2924            raise PermissionError(
2925                f"The given dataset ({datasetId}) isn't accessible with this "
2926                "user. This might be caused by a typo in the dataset Synapse ID."
2927            )
2928
2929        # If not, then assume dataset not in file view
2930        raise LookupError(
2931            f"The given dataset ({datasetId}) doesn't appear in the "
2932            f"configured file view ({self.storageFileview}). This might "
2933            "mean that the file view's scope needs to be updated."
2934        )

Get parent project for a given dataset ID.

Arguments:
  • datasetId (str): Synapse entity ID (folder or project).
Raises:
  • PermissionError: Raised if the given dataset isn't accessible to the user.
  • LookupError: Raised if the dataset doesn't appear in the configured file view.
Returns:

str: The Synapse ID for the parent project.
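
A brief usage sketch (hypothetical ID). Note that tenacity retries LookupError several times before the final exception is re-raised via raise_final_error:

    try:
        project_id = store.getDatasetProject("syn12345678")
    except LookupError:
        pass  # dataset not in the configured file view; the view's scope may need updating
    except PermissionError:
        pass  # dataset is not accessible to the authenticated user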

def getDatasetAnnotationsBatch( self, datasetId: str, dataset_file_ids: Optional[Sequence[str]] = None) -> pandas.core.frame.DataFrame:
2936    def getDatasetAnnotationsBatch(
2937        self, datasetId: str, dataset_file_ids: Optional[Sequence[str]] = None
2938    ) -> pd.DataFrame:
2939        """Generate table for annotations across all files in given dataset.
2940        This function uses a temporary file view to generate a table
2941        instead of iteratively querying for individual entity annotations.
2942        This function is expected to run much faster than
2943        `self.getDatasetAnnotations` on large datasets.
2944
2945        Args:
2946            datasetId (str): Synapse ID for dataset folder.
2947            dataset_file_ids (Sequence[str]): List of Synapse IDs
2948                for dataset files/folders used to subset the table.
2949
2950        Returns:
2951            pd.DataFrame: Table of annotations.
2952        """
2953        # Create data frame from annotations file view
2954        with DatasetFileView(datasetId, self.syn) as fileview:
2955            table = fileview.query()
2956
2957        if dataset_file_ids:
2958            table = table.loc[table.index.intersection(dataset_file_ids)]
2959
2960        table = table.reset_index(drop=True)
2961
2962        return table

Generate table for annotations across all files in given dataset. This function uses a temporary file view to generate a table instead of iteratively querying for individual entity annotations. This function is expected to run much faster than self.getDatasetAnnotations on large datasets.

Arguments:
  • datasetId (str): Synapse ID for dataset folder.
  • dataset_file_ids (Sequence[str]): List of Synapse IDs for dataset files/folders used to subset the table.
Returns:

pd.DataFrame: Table of annotations.
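
A brief usage sketch (hypothetical dataset ID), subsetting the file view to the files returned by getFilesInStorageDataset, which yields (id, filename) pairs:

    file_ids = [syn_id for syn_id, _ in store.getFilesInStorageDataset("syn12345678")]
    table = store.getDatasetAnnotationsBatch("syn12345678", dataset_file_ids=file_ids)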

class TableOperations:
2975class TableOperations:
2976    """
2977    Object to hold functions for various table operations specific to the Synapse Asset Store.
2978
2979    Currently implemented operations are:
2980    createTable: upload a manifest as a new table when none exists
2981    replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
2982    updateTable: add a column to a table that already exists on synapse
2983
2984    Operations currently in development are:
2985    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
2986    """
2987
2988    def __init__(
2989        self,
2990        synStore: SynapseStorage,
2991        tableToLoad: pd.DataFrame = None,
2992        tableName: str = None,
2993        datasetId: str = None,
2994        existingTableId: str = None,
2995        restrict: bool = False,
2996        synapse_entity_tracker: SynapseEntityTracker = None,
2997    ):
2998        """
2999        Class governing table operations (creation, replacement, upserts, updates) in schematic
3000
3001        tableToLoad: manifest formatted appropriately for the table
3002        tableName: name of the table to be uploaded
3003        datasetId: synID of the dataset for the manifest
3004        existingTableId: synId of the table currently existing on synapse (if there is one)
3005        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3006        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3007
3008        """
3009        self.synStore = synStore
3010        self.tableToLoad = tableToLoad
3011        self.tableName = tableName
3012        self.datasetId = datasetId
3013        self.existingTableId = existingTableId
3014        self.restrict = restrict
3015        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3016
3017    @tracer.start_as_current_span("TableOperations::createTable")
3018    def createTable(
3019        self,
3020        columnTypeDict: dict = None,
3021        specifySchema: bool = True,
3022    ):
3023        """
3024        Method to create a table from a metadata manifest and upload it to synapse
3025
3026        Args:
3027            columnTypeDict: dictionary schema for table columns: type, size, etc
3028            specifySchema: to specify a specific schema for the table format
3029
3030        Returns:
3031            table.schema.id: synID of the newly created table
3032        """
3033        datasetEntity = self.synapse_entity_tracker.get(
3034            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3035        )
3036        datasetName = datasetEntity.name
3037        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3038
3039        if not self.tableName:
3040            self.tableName = datasetName + "table"
3041        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3042        if specifySchema:
3043            if columnTypeDict == {}:
3044                logger.error("Did not provide a columnTypeDict.")
3045            # create list of columns:
3046            cols = []
3047            for col in self.tableToLoad.columns:
3048                if col in table_schema_by_cname:
3049                    col_type = table_schema_by_cname[col]["columnType"]
3050                    max_size = (
3051                        table_schema_by_cname[col]["maximumSize"]
3052                        if "maximumSize" in table_schema_by_cname[col].keys()
3053                        else 100
3054                    )
3055                    max_list_len = 250
3056                    if max_size and max_list_len:
3057                        cols.append(
3058                            Column(
3059                                name=col,
3060                                columnType=col_type,
3061                                maximumSize=max_size,
3062                                maximumListLength=max_list_len,
3063                            )
3064                        )
3065                    elif max_size:
3066                        cols.append(
3067                            Column(name=col, columnType=col_type, maximumSize=max_size)
3068                        )
3069                    else:
3070                        cols.append(Column(name=col, columnType=col_type))
3071                else:
3072                    # TODO: add a warning that the given col was not found and its max size is set to 100
3073                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3074            schema = Schema(
3075                name=self.tableName, columns=cols, parent=datasetParentProject
3076            )
3077            table = Table(schema, self.tableToLoad)
3078            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3079            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3080            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3081            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3082            return table.schema.id
3083        else:
3084            # For just uploading the tables to synapse using default
3085            # column types.
3086            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3087            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3088            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3089            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3090            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3091            return table.schema.id
3092
3093    @tracer.start_as_current_span("TableOperations::replaceTable")
3094    def replaceTable(
3095        self,
3096        specifySchema: bool = True,
3097        columnTypeDict: dict = None,
3098    ):
3099        """
3100        Method to replace an existing table on synapse with metadata from a new manifest
3101
3102        Args:
3103            specifySchema: to specify a schema for the table format
3104            columnTypeDict: dictionary schema for table columns: type, size, etc
3105
3106        Returns:
3107           existingTableId: synID of the already existing table that had its metadata replaced
3108        """
3109        datasetEntity = self.synapse_entity_tracker.get(
3110            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3111        )
3112
3113        datasetName = datasetEntity.name
3114        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3115        existing_table, existing_results = self.synStore.get_synapse_table(
3116            self.existingTableId
3117        )
3118        # remove rows
3119        self.synStore.syn.delete(existing_results)
3120        # Data changes such as removing all rows causes the eTag to change.
3121        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3122        # wait for row deletion to finish on synapse before getting empty table
3123        sleep(10)
3124
3125        # removes all current columns
3126        current_table = self.synapse_entity_tracker.get(
3127            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3128        )
3129
3130        current_columns = self.synStore.syn.getTableColumns(current_table)
3131        for col in current_columns:
3132            current_table.removeColumn(col)
3133
3134        if not self.tableName:
3135            self.tableName = datasetName + "table"
3136
3137        # Process columns according to manifest entries
3138        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3139        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3140        if specifySchema:
3141            if columnTypeDict == {}:
3142                logger.error("Did not provide a columnTypeDict.")
3143            # create list of columns:
3144            cols = []
3145
3146            for col in self.tableToLoad.columns:
3147                if col in table_schema_by_cname:
3148                    col_type = table_schema_by_cname[col]["columnType"]
3149                    max_size = (
3150                        table_schema_by_cname[col]["maximumSize"]
3151                        if "maximumSize" in table_schema_by_cname[col].keys()
3152                        else 100
3153                    )
3154                    max_list_len = 250
3155                    if max_size and max_list_len:
3156                        cols.append(
3157                            Column(
3158                                name=col,
3159                                columnType=col_type,
3160                                maximumSize=max_size,
3161                                maximumListLength=max_list_len,
3162                            )
3163                        )
3164                    elif max_size:
3165                        cols.append(
3166                            Column(name=col, columnType=col_type, maximumSize=max_size)
3167                        )
3168                    else:
3169                        cols.append(Column(name=col, columnType=col_type))
3170                else:
3171                    # TODO add a warning that the given col was not found and its max size is set to 100
3172                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3173
3174            # adds new columns to schema
3175            for col in cols:
3176                current_table.addColumn(col)
3177            table_result = self.synStore.syn.store(
3178                current_table, isRestricted=self.restrict
3179            )
3180            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3181            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3182            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3183
3184            # wait for synapse store to finish
3185            sleep(1)
3186
3187            # build schema and table from columns and store with necessary restrictions
3188            schema = Schema(
3189                name=self.tableName, columns=cols, parent=datasetParentProject
3190            )
3191            schema.id = self.existingTableId
3192            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3193            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3194            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3195            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3196            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3197        else:
3198            logger.error("Must specify a schema for table replacements")
3199
3200        # remove system metadata from manifest
3201        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3202        return self.existingTableId
3203
3204    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3205    def _get_auth_token(
3206        self,
3207    ):
3208        authtoken = None
3209
3210        # Get access token from environment variable if available
3211        # Primarily useful for testing environments, with other possible usefulness for containers
3212        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3213        if env_access_token:
3214            authtoken = env_access_token
3215            return authtoken
3216
3217        # Get token from authorization header
3218        # Primarily useful for API endpoint functionality
3219        if "Authorization" in self.synStore.syn.default_headers:
3220            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3221                "Bearer "
3222            )[-1]
3223            return authtoken
3224
3225        # Retrieve credentials from the synapse object
3226        # Primarily useful for local users; credentials are only stored here when a .synapseConfig file is used, but checking to be safe
3227        synapse_object_creds = self.synStore.syn.credentials
3228        if hasattr(synapse_object_creds, "_token"):
3229            authtoken = synapse_object_creds.secret
3230
3231        # Try getting creds from .synapseConfig file if it exists
3232        # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
3233        if os.path.exists(CONFIG.synapse_configuration_path):
3234            config = get_config_file(CONFIG.synapse_configuration_path)
3235
3236            # check which credentials are provided in file
3237            if config.has_option("authentication", "authtoken"):
3238                authtoken = config.get("authentication", "authtoken")
3239
3240        # raise error if required credentials are not found
3241        if not authtoken:
3242            raise NameError(
3243                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3244            )
3245
3246        return authtoken
3247
3248    @tracer.start_as_current_span("TableOperations::upsertTable")
3249    def upsertTable(self, dmge: DataModelGraphExplorer):
3250        """
3251        Method to upsert rows from a new manifest into an existing table on synapse
3252        For upsert functionality to work, primary keys must follow the naming convention of <component>_id
3253        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3254        Currently it is required to use -dl/--use_display_label with table upserts.
3255
3256
3257        Args:
3258            dmge: DataModelGraphExplorer instance
3259
3260        Returns:
3261           existingTableId: synID of the already existing table that had its metadata replaced
3262        """
3263
3264        authtoken = self._get_auth_token()
3265
3266        synapseDB = SynapseDatabase(
3267            auth_token=authtoken,
3268            project_id=self.synStore.getDatasetProject(self.datasetId),
3269            syn=self.synStore.syn,
3270            synapse_entity_tracker=self.synapse_entity_tracker,
3271        )
3272
3273        try:
3274            # Try performing upsert
3275            synapseDB.upsert_table_rows(
3276                table_name=self.tableName, data=self.tableToLoad
3277            )
3278        except SynapseHTTPError as ex:
3279            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3280            if "Id is not a valid column name or id" in str(ex):
3281                self._update_table_uuid_column(dmge)
3282                synapseDB.upsert_table_rows(
3283                    table_name=self.tableName, data=self.tableToLoad
3284                )
3285            # Raise if other error
3286            else:
3287                raise ex
3288
3289        return self.existingTableId
3290
3291    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3292    def _update_table_uuid_column(
3293        self,
3294        dmge: DataModelGraphExplorer,
3295    ) -> None:
3296        """Removes the `Uuid` column when present, and replaces it with an `Id` column
3297        Used to enable backwards compatibility for manifests using the old `Uuid` convention
3298
3299        Args:
3300            dmge: DataModelGraphExplorer instance
3301
3302        Returns:
3303            None
3304        """
3305
3306        # Get the columns of the schema
3307        schema = self.synapse_entity_tracker.get(
3308            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3309        )
3310
3311        cols = self.synStore.syn.getTableColumns(schema)
3312
3313        # Iterate through columns until `Uuid` column is found
3314        for col in cols:
3315            if col.name.lower() == "uuid":
3316                # See if schema has `Uuid` column specified
3317                try:
3318                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3319                except KeyError:
3320                    uuid_col_in_schema = False
3321
3322                # If there is, then create a new `Id` column from scratch
3323                if uuid_col_in_schema:
3324                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3325                    schema.addColumn(new_col)
3326                    schema = self.synStore.syn.store(schema)
3327                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3328                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3329                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3330                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3331                else:
3332                    # Build ColumnModel that will be used for new column
3333                    id_column = Column(
3334                        name="Id",
3335                        columnType="STRING",
3336                        maximumSize=64,
3337                        defaultValue=None,
3338                        maximumListLength=1,
3339                    )
3340                    new_col_response = self.synStore.syn.store(id_column)
3341
3342                    # Define columnChange body
3343                    columnChangeDict = {
3344                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3345                        "entityId": self.existingTableId,
3346                        "changes": [
3347                            {
3348                                "oldColumnId": col["id"],
3349                                "newColumnId": new_col_response["id"],
3350                            }
3351                        ],
3352                    }
3353
3354                    self.synStore.syn._async_table_update(
3355                        table=self.existingTableId,
3356                        changes=[columnChangeDict],
3357                        wait=False,
3358                    )
3359                break
3360
3361        return
3362
3363    @tracer.start_as_current_span("TableOperations::updateTable")
3364    def updateTable(
3365        self,
3366        update_col: str = "Id",
3367    ):
3368        """
3369        Method to update an existing table with a new column
3370
3371        Args:
3372            update_col: column to index the old and new tables on
3373
3374        Returns:
3375           existingTableId: synID of the already existing table that had its metadata replaced
3376        """
3377        existing_table, existing_results = self.synStore.get_synapse_table(
3378            self.existingTableId
3379        )
3380
3381        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3382        # store table with existing etag data and impose restrictions as appropriate
3383        table_result = self.synStore.syn.store(
3384            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3385            isRestricted=self.restrict,
3386        )
3387        # We cannot store the Table to the `synapse_entity_tracker` because there is
3388        # no `Schema` on the table object. The above `.store()` function call would
3389        # also update the ETag of the entity within Synapse. Remove it from the tracker
3390        # and re-retrieve it later on if needed again.
3391        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3392
3393        return self.existingTableId

Object to hold functions for various table operations specific to the Synapse Asset Store.

Currently implemented operations are:
  • createTable: upload a manifest as a new table when none exists
  • replaceTable: replace the metadata in an existing table with metadata from another manifest
  • updateTable: add a column to a table that already exists on Synapse

Operations currently in development are:
  • upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest

TableOperations( synStore: SynapseStorage, tableToLoad: pandas.core.frame.DataFrame = None, tableName: str = None, datasetId: str = None, existingTableId: str = None, restrict: bool = False, synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = None)

Class governing table operations (creation, replacement, upserts, updates) in schematic

Arguments:
  • tableToLoad: manifest formatted appropriately for the table
  • tableName: name of the table to be uploaded
  • datasetId: synID of the dataset for the manifest
  • existingTableId: synID of the table currently existing on synapse (if there is one)
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
  • synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

synStore
tableToLoad
tableName
datasetId
existingTableId
restrict
synapse_entity_tracker
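
The constructor only stores its arguments; a typical flow builds the helper and then dispatches to one of the operation methods. A minimal usage sketch (the manifest, table name, and synIDs below are placeholders, and it assumes a SynapseStorage instance can be constructed with ambient credentials):

    import pandas as pd

    from schematic.store.synapse import SynapseStorage, TableOperations

    # Hypothetical manifest and placeholder Synapse IDs, for illustration only
    manifest = pd.DataFrame({"Sample_id": ["s1", "s2"], "Tissue": ["liver", "lung"]})

    store = SynapseStorage()  # assumes credentials come from the environment or .synapseConfig
    ops = TableOperations(
        synStore=store,
        tableToLoad=manifest,
        tableName="example_manifest_table",  # placeholder name
        datasetId="syn00000000",             # placeholder dataset synID
        restrict=False,
    )
    table_id = ops.createTable(columnTypeDict=None, specifySchema=False)
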
@tracer.start_as_current_span('TableOperations::createTable')
def createTable(self, columnTypeDict: dict = None, specifySchema: bool = True):

Method to create a table from a metadata manifest and upload it to synapse

Arguments:
  • columnTypeDict: dictionary schema for table columns: type, size, etc
  • specifySchema: whether to explicitly define a schema for the table format
Returns:

table.schema.id: synID of the newly created table
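
When specifySchema is True, each manifest column is mapped to a synapseclient Column, with maximumSize defaulting to 100 and maximumListLength to 250 when the schema lookup lacks those values. A condensed sketch of that mapping (the table_schema_by_cname dict below is a hypothetical stand-in for the output of _get_table_schema_by_cname):

    from synapseclient import Column

    # Hypothetical schema lookup result, for illustration only
    table_schema_by_cname = {"Tissue": {"columnType": "STRING", "maximumSize": 50}}

    cols = []
    for name in ["Tissue", "Notes"]:  # manifest columns
        spec = table_schema_by_cname.get(name)
        if spec:
            cols.append(
                Column(
                    name=name,
                    columnType=spec["columnType"],
                    maximumSize=spec.get("maximumSize", 100),
                    maximumListLength=250,
                )
            )
        else:
            # Columns missing from the lookup fall back to STRING with maximumSize=100
            cols.append(Column(name=name, columnType="STRING", maximumSize=100))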

@tracer.start_as_current_span('TableOperations::replaceTable')
def replaceTable(self, specifySchema: bool = True, columnTypeDict: dict = None):

Method to replace an existing table on synapse with metadata from a new manifest

Arguments:
  • specifySchema: whether to explicitly define a schema for the table format
  • columnTypeDict: dictionary schema for table columns: type, size, etc
Returns:

existingTableId: synID of the already existing table that had its metadata replaced
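
The row deletion, column swap, and re-store are handled internally; the caller only supplies the new manifest and the existing table's synID. A hedged usage sketch, reusing the store and manifest placeholders from the constructor example above (the existing table synID is also a placeholder):

    ops = TableOperations(
        synStore=store,
        tableToLoad=manifest,
        datasetId="syn00000000",        # placeholder dataset synID
        existingTableId="syn11111111",  # placeholder: table whose contents are replaced
        restrict=False,
    )
    same_id = ops.replaceTable(specifySchema=True, columnTypeDict=None)
    # The table keeps its synID; only its rows and columns are replaced
    assert same_id == "syn11111111"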

@tracer.start_as_current_span('TableOperations::upsertTable')
def upsertTable( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer):

Method to upsert rows from a new manifest into an existing table on synapse. For upsert functionality to work, primary keys must follow the naming convention of <component>_id. `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. Currently it is required to use -dl/--use_display_label with table upserts.

Arguments:
  • dmge: DataModelGraphExplorer instance
Returns:

existingTableId: synID of the already existing table that had its metadata replaced
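
The <component>_id convention means the manifest must carry a key column named after its component. A small illustrative manifest for a hypothetical Biospecimen component (values are placeholders):

    import pandas as pd

    # Primary key column follows the <component>_id convention (here: Biospecimen)
    manifest = pd.DataFrame(
        {
            "Biospecimen_id": ["b1", "b2"],  # matched against existing rows during upsert
            "Patient": ["p1", "p2"],
            "Tissue Status": ["Healthy", "Malignant"],
        }
    )
    # Rows whose Biospecimen_id already exists in the table are updated in place;
    # unseen ids are appended as new rows.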

@tracer.start_as_current_span('TableOperations::updateTable')
def updateTable(self, update_col: str = 'Id'):

Method to update an existing table with a new column

Arguments:
  • update_col: column to index the old and new tables on
Returns:

existingTableId: synID of the already existing table that had its metadata replaced
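
Internally, update_df aligns the new manifest with the existing table on update_col before the combined frame is stored back under the original eTag. The effect is roughly analogous to a pandas key-aligned merge (an analogy for intuition, not the exact implementation):

    import pandas as pd

    existing = pd.DataFrame({"Id": ["a", "b"], "Tissue": ["liver", "lung"]})
    new = pd.DataFrame({"Id": ["a", "b"], "Notes": ["ok", "recheck"]})

    # Index both frames on the shared key, align columns, then restore the key column
    merged = existing.set_index("Id").combine_first(new.set_index("Id")).reset_index()
    print(merged)  # one row per Id, carrying both the Tissue and Notes columns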

class DatasetFileView:
3396class DatasetFileView:
3397    """Helper class to create temporary dataset file views.
3398    This class can be used in conjunction with a 'with' statement.
3399    This will ensure that the file view is deleted automatically.
3400    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3401    """
3402
3403    def __init__(
3404        self,
3405        datasetId: str,
3406        synapse: Synapse,
3407        name: str = None,
3408        temporary: bool = True,
3409        parentId: str = None,
3410    ) -> None:
3411        """Create a file view scoped to a dataset folder.
3412
3413        Args:
3414            datasetId (str): Synapse ID for a dataset folder/project.
3415            synapse (Synapse): Used for Synapse requests.
3416            name (str): Name of the file view (temporary or not).
3417            temporary (bool): Whether to delete the file view on exit
3418                of either a 'with' statement or Python entirely.
3419            parentId (str, optional): Synapse ID specifying where to
3420                store the file view. Defaults to datasetId.
3421        """
3422
3423        self.datasetId = datasetId
3424        self.synapse = synapse
3425        self.is_temporary = temporary
3426
3427        # Fall back to a default name when none is provided
3428        self.name = name or f"schematic annotation file view for {self.datasetId}"
3429
3430        if self.is_temporary:
3431            uid = secrets.token_urlsafe(5)
3432            self.name = f"{self.name} - UID {uid}"
3433
3434        # TODO: Allow a DCC admin to configure a "universal parent"
3435        #       Such as a Synapse project writeable by everyone.
3436        self.parentId = datasetId if parentId is None else parentId
3437
3438        # TODO: Create local sharing setting to hide from everyone else
3439        view_schema = EntityViewSchema(
3440            name=self.name,
3441            parent=self.parentId,
3442            scopes=self.datasetId,
3443            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3444            addDefaultViewColumns=False,
3445            addAnnotationColumns=True,
3446        )
3447
3448        # TODO: Handle failure due to insufficient permissions by
3449        #       creating a temporary new project to store view
3450        self.view_schema = self.synapse.store(view_schema)
3451
3452        # These are filled in after calling `self.query()`
3453        self.results = None
3454        self.table = None
3455
3456        # Ensure deletion of the file view (last resort)
3457        if self.is_temporary:
3458            atexit.register(self.delete)
3459
3460    def __enter__(self):
3461        """Return file view when entering 'with' statement."""
3462        return self
3463
3464    def __exit__(self, exc_type, exc_value, traceback):
3465        """Delete file view when exiting 'with' statement."""
3466        if self.is_temporary:
3467            self.delete()
3468
3469    def delete(self):
3470        """Delete the file view on Synapse without deleting local table."""
3471        if self.view_schema is not None:
3472            self.synapse.delete(self.view_schema)
3473            self.view_schema = None
3474
3475    def query(self, tidy=True, force=False):
3476        """Retrieve file view as a data frame (raw format sans index)."""
3477        if self.table is None or force:
3478            fileview_id = self.view_schema["id"]
3479            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3480            self.table = self.results.asDataFrame(
3481                rowIdAndVersionInIndex=False,
3482                na_values=STR_NA_VALUES_FILTERED,
3483                keep_default_na=False,
3484            )
3485        if tidy:
3486            self.tidy_table()
3487        return self.table
3488
3489    def tidy_table(self):
3490        """Convert raw file view data frame into more usable format."""
3491        assert self.table is not None, "Must call `self.query()` first."
3492        self._fix_default_columns()
3493        self._fix_list_columns()
3494        self._fix_int_columns()
3495        return self.table
3496
3497    def _fix_default_columns(self):
3498        """Rename default columns to match schematic expectations."""
3499
3500        # Drop ROW_VERSION column if present
3501        if "ROW_VERSION" in self.table:
3502            del self.table["ROW_VERSION"]
3503
3504        # Rename id column to entityId and set as data frame index
3505        if "ROW_ID" in self.table:
3506            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3507            self.table = self.table.set_index("entityId", drop=False)
3508            del self.table["ROW_ID"]
3509
3510        # Rename ROW_ETAG column to eTag and place at end of data frame
3511        if "ROW_ETAG" in self.table:
3512            row_etags = self.table.pop("ROW_ETAG")
3513
3514            # eTag column may already be present if users annotated data without submitting a manifest
3515            # we're only concerned with the new values and not the existing ones
3516            if "eTag" in self.table:
3517                del self.table["eTag"]
3518
3519            self.table.insert(len(self.table.columns), "eTag", row_etags)
3520
3521        return self.table
3522
3523    def _get_columns_of_type(self, types):
3524        """Helper function to get list of columns of a given type(s)."""
3525        matching_columns = []
3526        for header in self.results.headers:
3527            if header.columnType in types:
3528                matching_columns.append(header.name)
3529        return matching_columns
3530
3531    def _fix_list_columns(self):
3532        """Fix formatting of list-columns."""
3533        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3534        list_columns = self._get_columns_of_type(list_types)
3535        for col in list_columns:
3536            self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
3537        return self.table
3538
3539    def _fix_int_columns(self):
3540        """Ensure that integer-columns are actually integers."""
3541        int_columns = self._get_columns_of_type({"INTEGER"})
3542        for col in int_columns:
3543            # Coercing to string because NaN is a floating point value
3544            # and cannot exist alongside integers in a column
3545            def to_int_fn(x):
3546                return "" if np.isnan(x) else str(int(x))
3547
3548            self.table[col] = self.table[col].apply(to_int_fn)
3549        return self.table

Helper class to create temporary dataset file views. This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. See SynapseStorage.getDatasetAnnotationsBatch for example usage.

DatasetFileView( datasetId: str, synapse: synapseclient.client.Synapse, name: str = None, temporary: bool = True, parentId: str = None)

Create a file view scoped to a dataset folder.

Arguments:
  • datasetId (str): Synapse ID for a dataset folder/project.
  • synapse (Synapse): Used for Synapse requests.
  • name (str): Name of the file view (temporary or not).
  • temporary (bool): Whether to delete the file view on exit of either a 'with' statement or Python entirely.
  • parentId (str, optional): Synapse ID specifying where to store the file view. Defaults to datasetId.
datasetId
synapse
is_temporary
parentId
view_schema
results
table
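
As the class docstring notes, the context-manager form guarantees the temporary view is cleaned up. A minimal sketch (placeholder dataset synID; assumes an authenticated Synapse client):

    import synapseclient

    from schematic.store.synapse import DatasetFileView

    syn = synapseclient.Synapse()
    syn.login(authToken="...")  # token elided

    # The temporary file view is deleted automatically when the block exits
    with DatasetFileView("syn00000000", syn) as view:  # placeholder dataset synID
        annotations = view.query(tidy=True)
    print(annotations.head())
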
def delete(self):

Delete the file view on Synapse without deleting local table.

def query(self, tidy=True, force=False):

Retrieve file view as a data frame (raw format sans index).

def tidy_table(self):

Convert raw file view data frame into more usable format.
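
The list- and integer-column fixes applied here are simple column-wise transforms; roughly (illustrative values only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "tags": [["a", "b"], ["c"]],  # a STRING_LIST column as returned by Synapse
            "count": [1.0, np.nan],       # an INTEGER column surfaced as floats
        }
    )

    # List columns are flattened into comma-separated strings
    df["tags"] = df["tags"].apply(", ".join)
    # Integer columns are stringified so blanks can coexist with whole numbers
    df["count"] = df["count"].apply(lambda x: "" if np.isnan(x) else str(int(x)))
    print(df)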