schematic.store.synapse

Synapse storage class

   1"""Synapse storage class"""
   2
   3import asyncio
   4import atexit
   5import logging
   6import os
   7import re
   8import secrets
   9import shutil
  10import time
  11import uuid  # used to generate unique names for entities
  12from copy import deepcopy
  13from dataclasses import dataclass, field
  14from time import sleep
  15
  16# allows specifying explicit variable types
  17from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
  18
  19import numpy as np
  20import pandas as pd
  21import synapseclient
  22from opentelemetry import trace
  23from synapseclient import Annotations as OldAnnotations
  24from synapseclient import (
  25    Column,
  26    EntityViewSchema,
  27    EntityViewType,
  28    File,
  29    Folder,
  30    Schema,
  31    Synapse,
  32    Table,
  33    as_table_columns,
  34)
  35from synapseclient.annotations import _convert_to_annotations_list
  36from synapseclient.api import get_config_file, get_entity_id_bundle2
  37from synapseclient.core.constants.concrete_types import PROJECT_ENTITY
  38from synapseclient.core.exceptions import (
  39    SynapseAuthenticationError,
  40    SynapseHTTPError,
  41    SynapseUnmetAccessRestrictions,
  42)
  43from synapseclient.models.annotations import Annotations
  44from synapseclient.table import CsvFileTable, Schema, build_table
  45from tenacity import (
  46    retry,
  47    retry_if_exception_type,
  48    stop_after_attempt,
  49    wait_chain,
  50    wait_fixed,
  51)
  52
  53from schematic.configuration.configuration import CONFIG
  54from schematic.exceptions import AccessCredentialsError
  55from schematic.schemas.data_model_graph import DataModelGraphExplorer
  56from schematic.store.base import BaseStorage
  57from schematic.store.database.synapse_database import SynapseDatabase
  58from schematic.store.synapse_tracker import SynapseEntityTracker
  59from schematic.utils.df_utils import (
  60    STR_NA_VALUES_FILTERED,
  61    col_in_dataframe,
  62    load_df,
  63    update_df,
  64)
  65
  66# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
  67# Please do not remove these import statements
  68from schematic.utils.general import (
  69    check_synapse_cache_size,
  70    clear_synapse_cache,
  71    create_temp_folder,
  72    entity_type_mapping,
  73    get_dir_size,
  74)
  75from schematic.utils.io_utils import cleanup_temporary_storage
  76from schematic.utils.schema_utils import get_class_label_from_display_name
  77from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list
  78
  79logger = logging.getLogger("Synapse storage")
  80
  81tracer = trace.get_tracer("Schematic")
  82
  83
  84@dataclass
  85class ManifestDownload(object):
  86    """
  87    syn: an object of type synapseclient.
  88    manifest_id: id of a manifest
  89    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
  90    """
  91
  92    syn: synapseclient.Synapse
  93    manifest_id: str
  94    synapse_entity_tracker: SynapseEntityTracker = field(
  95        default_factory=SynapseEntityTracker
  96    )
  97
  98    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
  99        """
 100        Try downloading a manifest to a specific folder (temporary or not). When the
 101        `use_temporary_folder` is set to True, the manifest will be downloaded to a
 102        temporary folder. This is useful for when the code is running as an API server
 103        where multiple requests are being made at the same time. This will prevent
 104        multiple requests from overwriting the same manifest file. When the
 105        `use_temporary_folder` is set to False, the manifest will be downloaded to the
 106        default manifest folder.
 107
 108        Args:
 109            use_temporary_folder: boolean argument indicating if a temporary folder
 110                should be used to store the manifest file. This is useful when running
 111                this code as an API server where multiple requests could be made at the
 112                same time. This is set to False when the code is being used from the
 113                CLI. Defaults to True.
 114
 115        Return:
 116            manifest_data: A Synapse file entity of the downloaded manifest
 117        """
 118        manifest_data = self.synapse_entity_tracker.get(
 119            synapse_id=self.manifest_id,
 120            syn=self.syn,
 121            download_file=False,
 122            retrieve_if_not_present=False,
 123        )
 124        current_span = trace.get_current_span()
 125        if (
 126            manifest_data
 127            and (file_handle := manifest_data.get("_file_handle", None))
 128            and current_span.is_recording()
 129        ):
 130            current_span.set_attribute(
 131                "schematic.manifest_size", file_handle.get("contentSize", 0)
 132            )
 133
 134        if manifest_data and manifest_data.path:
 135            return manifest_data
 136
 137        if "SECRETS_MANAGER_SECRETS" in os.environ:
 138            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 139            cleanup_temporary_storage(
 140                temporary_manifest_storage, time_delta_seconds=3600
 141            )
 142            # create a new directory to store manifest
 143            if not os.path.exists(temporary_manifest_storage):
 144                os.mkdir(temporary_manifest_storage)
 145            # create temporary folders for storing manifests
 146            download_location = create_temp_folder(
 147                path=temporary_manifest_storage,
 148                prefix=f"{self.manifest_id}-{time.time()}-",
 149            )
 150        else:
 151            if use_temporary_folder:
 152                download_location = create_temp_folder(
 153                    path=CONFIG.manifest_folder,
 154                    prefix=f"{self.manifest_id}-{time.time()}-",
 155                )
 156            else:
 157                download_location = CONFIG.manifest_folder
 158
 159        manifest_data = self.synapse_entity_tracker.get(
 160            synapse_id=self.manifest_id,
 161            syn=self.syn,
 162            download_file=True,
 163            retrieve_if_not_present=True,
 164            download_location=download_location,
 165        )
 166
  167        # This is doing a rename of the downloaded file. It matters when we are
  168        # re-using a file that was previously downloaded but had since been renamed:
  169        # the file downloaded by the Synapse client is just a direct copy of that
  170        # renamed file. This code sets the name of the file back to the original
  171        # name that was used to download the file.
  172        # Note: An MD5 checksum of the file will still be performed, so if the file
  173        # has changed, it will be downloaded again.
 174        filename = manifest_data._file_handle.fileName
 175        if filename != os.path.basename(manifest_data.path):
 176            parent_folder = os.path.dirname(manifest_data.path)
 177            manifest_original_name_and_path = os.path.join(parent_folder, filename)
 178
 179            self.syn.cache.remove(
 180                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
 181            )
 182            os.rename(manifest_data.path, manifest_original_name_and_path)
 183            manifest_data.path = manifest_original_name_and_path
 184            self.syn.cache.add(
 185                file_handle_id=manifest_data.dataFileHandleId,
 186                path=manifest_original_name_and_path,
 187                md5=manifest_data._file_handle.contentMd5,
 188            )
 189
 190        return manifest_data
 191
  192    def _entity_type_checking(self) -> None:
  193        """
  194        Check the entity type of the Synapse ID that is to be downloaded.
  195        Return:
  196             None. Logs an error if the entity type is not a file.
  197        """
 198        # check the type of entity
 199        entity_type = entity_type_mapping(
 200            syn=self.syn,
 201            entity_id=self.manifest_id,
 202            synapse_entity_tracker=self.synapse_entity_tracker,
 203        )
 204        if entity_type != "file":
 205            logger.error(
 206                f"You are using entity type: {entity_type}. Please provide a file ID"
 207            )
 208
 209    def download_manifest(
 210        self,
 211        newManifestName: str = "",
 212        manifest_df: pd.DataFrame = pd.DataFrame(),
 213        use_temporary_folder: bool = True,
 214    ) -> Union[str, File]:
 215        """
 216        Download a manifest based on a given manifest id.
 217        Args:
 218            newManifestName(optional): new name of a manifest that gets downloaded.
 219            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
 220        Return:
 221            manifest_data: synapse entity file object
 222        """
 223
 224        # enables retrying if user does not have access to uncensored manifest
 225        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
 226        manifest_data = ""
 227
 228        # check entity type
 229        self._entity_type_checking()
 230
 231        # download a manifest
 232        try:
 233            manifest_data = self._download_manifest_to_folder(
 234                use_temporary_folder=use_temporary_folder
 235            )
 236        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
 237            # if there's an error getting an uncensored manifest, try getting the censored manifest
 238            if not manifest_df.empty:
 239                censored_regex = re.compile(".*censored.*")
 240                censored = manifest_df["name"].str.contains(censored_regex)
  241                new_manifest_id = manifest_df[censored]["id"].iloc[0]
 242                self.manifest_id = new_manifest_id
 243                try:
 244                    manifest_data = self._download_manifest_to_folder(
 245                        use_temporary_folder=use_temporary_folder
 246                    )
 247                except (
 248                    SynapseUnmetAccessRestrictions,
 249                    SynapseAuthenticationError,
 250                ) as e:
 251                    raise PermissionError(
 252                        "You don't have access to censored and uncensored manifests in this dataset."
 253                    ) from e
 254            else:
 255                logger.error(
 256                    f"You don't have access to the requested resource: {self.manifest_id}"
 257                )
 258
 259        if newManifestName and os.path.exists(manifest_data.get("path")):
 260            # Rename the file we just made to the new name
 261            new_manifest_filename = newManifestName + ".csv"
 262
 263            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
 264            parent_folder = os.path.dirname(manifest_data.get("path"))
 265
 266            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
 267
  268            # Copy the file to the new location. A copy is used instead of a rename
  269            # to avoid any potential issues with the file being used by another
  270            # process, i.e. potential race conditions.
 271            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
 272
 273            # Adding this to cache will allow us to re-use the already downloaded
 274            # manifest file for up to 1 hour.
 275            self.syn.cache.add(
 276                file_handle_id=manifest_data.dataFileHandleId,
 277                path=new_manifest_path_name,
 278                md5=manifest_data._file_handle.contentMd5,
 279            )
 280
 281            # Update file names/paths in manifest_data
 282            manifest_data["name"] = new_manifest_filename
 283            manifest_data["filename"] = new_manifest_filename
 284            manifest_data["path"] = new_manifest_path_name
 285
 286        return manifest_data
 287
 288
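# --- Editorial example: a hedged usage sketch, not part of the original source. ---
# A minimal illustration of downloading a manifest with ManifestDownload. The
# manifest ID below is a placeholder, and SYNAPSE_ACCESS_TOKEN is assumed to be
# set in the environment.
def _example_manifest_download() -> None:
    syn = synapseclient.Synapse(skip_checks=True)
    syn.login(authToken=os.getenv("SYNAPSE_ACCESS_TOKEN"), silent=True)
    md = ManifestDownload(syn=syn, manifest_id="syn12345678")  # placeholder ID
    # a temporary folder keeps concurrent requests from overwriting each other
    manifest_file = md.download_manifest(use_temporary_folder=True)
    if manifest_file:
        print(manifest_file.path)
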
 289class SynapseStorage(BaseStorage):
 290    """Implementation of Storage interface for datasets/files stored on Synapse.
  291    Provides utilities to list files in a specific project; update file annotations; create fileviews; etc.
 292
 293    TODO: Need to define the interface and rename and/or refactor some of the methods below.
 294    """
 295
 296    @tracer.start_as_current_span("SynapseStorage::__init__")
 297    def __init__(
 298        self,
 299        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
 300        access_token: Optional[str] = None,
 301        project_scope: Optional[list] = None,
 302        synapse_cache_path: Optional[str] = None,
 303        perform_query: Optional[bool] = True,
 304        columns: Optional[list] = None,
 305        where_clauses: Optional[list] = None,
 306    ) -> None:
 307        """Initializes a SynapseStorage object.
 308
 309        Args:
 310            token (Optional[str], optional):
 311              Optional token parameter as found in browser cookie upon login to synapse.
 312              Defaults to None.
  313            access_token (Optional[str], optional):
 314              Optional access token (personal or oauth).
 315              Defaults to None.
  316            project_scope (Optional[list], optional): List of project IDs used to scope the fileview query. Defaults to None.
 317            synapse_cache_path (Optional[str], optional):
 318              Location of synapse cache.
 319              Defaults to None.
 320        TODO:
  321            Consider the necessity of adding "columns" and "where_clauses" params to the constructor. Currently, with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
 322        """
 323        self.syn = self.login(synapse_cache_path, access_token)
 324        current_span = trace.get_current_span()
 325        if current_span.is_recording():
 326            current_span.set_attribute("user.id", self.syn.credentials.owner_id)
 327        self.project_scope = project_scope
 328        self.storageFileview = CONFIG.synapse_master_fileview_id
 329        self.manifest = CONFIG.synapse_manifest_basename
 330        self.root_synapse_cache = self.syn.cache.cache_root_dir
 331        self.synapse_entity_tracker = SynapseEntityTracker()
 332        if perform_query:
 333            self.query_fileview(columns=columns, where_clauses=where_clauses)
 334
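    # --- Editorial example: a hedged usage sketch, not part of the original source. ---
    # Constructing the store while deferring the fileview query, then running a
    # narrower query later; the project ID is a placeholder.
    #
    #     store = SynapseStorage(
    #         project_scope=["syn12345678"],  # placeholder project ID
    #         perform_query=False,            # skip the query at construction time
    #     )
    #     store.query_fileview(columns=["id", "name", "path", "parentId"])
    #     fileview_df = store.getStorageFileviewTable()
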
 335    # TODO: When moving this over to a regular cron-job the following logic should be
 336    # out of `manifest_download`:
 337    # if "SECRETS_MANAGER_SECRETS" in os.environ:
 338    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 339    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
 340    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
 341    def _purge_synapse_cache(
 342        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
 343    ) -> None:
 344        """
 345        Purge synapse cache if it exceeds a certain size. Default to 1GB.
 346        Args:
 347            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
 348              before purging cache. Default is 1 GB.
 349            minute_buffer (int): All files created this amount of time or older will be deleted
 350        """
 351        # try clearing the cache
 352        # scan a directory and check size of files
 353        if os.path.exists(self.root_synapse_cache):
 354            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
 355                1024**3
 356            )
 357            nbytes = get_dir_size(self.root_synapse_cache)
 358            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
  359            # if the cache already exceeds the allowed size, purge files older than the minute buffer
 360            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
 361                num_of_deleted_files = clear_synapse_cache(
 362                    self.syn.cache, minutes=minute_buffer
 363                )
 364                logger.info(
 365                    f"{num_of_deleted_files}  files have been deleted from {self.root_synapse_cache}"
 366                )
 367            else:
  368                # on AWS, the OS takes around 14-17% of our ephemeral storage (20GiB);
  369                # instead of guessing how much space we have left, log the total size of .synapseCache here
 370                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
 371
 372    @tracer.start_as_current_span("SynapseStorage::query_fileview")
 373    def query_fileview(
 374        self,
 375        columns: Optional[list] = None,
 376        where_clauses: Optional[list] = None,
 377        force_requery: Optional[bool] = False,
 378    ) -> None:
 379        """
  380        Method to query the Synapse FileView and store the results as a pandas DataFrame in the storageFileviewTable attribute.
  381        Called once during initialization of the SynapseStorage object; can be called again later to apply a more limited scope for validation purposes.
 382        Args:
 383            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 384            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 385            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
 386        """
 387        self._purge_synapse_cache()
 388
  389        # Initialize to assume that the new fileview query will be different from what may already be stored. Initialized to True because generally a query will not already have been performed.
 390        self.new_query_different = True
 391
 392        # If a query has already been performed, store the query
 393        previous_query_built = hasattr(self, "fileview_query")
 394        if previous_query_built:
 395            previous_query = self.fileview_query
 396
 397        # Build a query with the current given parameters and check to see if it is different from the previous
 398        self._build_query(columns=columns, where_clauses=where_clauses)
 399        if previous_query_built:
 400            self.new_query_different = self.fileview_query != previous_query
 401
 402        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
 403        if self.new_query_different or force_requery:
 404            try:
 405                self.storageFileviewTable = self.syn.tableQuery(
 406                    query=self.fileview_query,
 407                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
 408            except SynapseHTTPError as exc:
 409                exception_text = str(exc)
 410                if "Unknown column path" in exception_text:
 411                    raise ValueError(
  412                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
 413                    )
 414                elif "Unknown column" in exception_text:
 415                    missing_column = exception_text.split("Unknown column ")[-1]
 416                    raise ValueError(
 417                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
 418                    )
 419                else:
 420                    raise AccessCredentialsError(self.storageFileview)
 421
 422    @staticmethod
 423    def build_clause_from_dataset_id(
 424        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
 425    ) -> str:
 426        """
 427        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
 428        Args:
 429            dataset_id: Synapse ID of a dataset that should be used to limit the query
 430            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
 431        Returns:
 432            clause for the query or an empty string if no dataset ID is provided
 433        """
 434        # Calling this method without specifying synIDs will complete but will not scope the view
 435        if (not dataset_id) and (not dataset_folder_list):
 436            return ""
 437
 438        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
 439        if dataset_folder_list:
 440            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
 441            return f"parentId IN ({search_folders})"
 442
 443        # `dataset_id` should be provided when all files are stored directly under the dataset folder
 444        return f"parentId='{dataset_id}'"
 445
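    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # The clause builder is a staticmethod usable before a SynapseStorage object
    # exists; the synIDs are placeholders.
    #
    #     SynapseStorage.build_clause_from_dataset_id(dataset_id="syn111")
    #     # -> "parentId='syn111'"
    #     SynapseStorage.build_clause_from_dataset_id(
    #         dataset_folder_list=["syn111", "syn222"]
    #     )
    #     # -> "parentId IN ('syn111', 'syn222')"
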
 446    def _build_query(
 447        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
 448    ):
 449        """
 450        Method to build a query for Synapse FileViews
 451        Args:
 452            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 453            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 454            self.storageFileview (str): Synapse FileView ID
 455            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
  456                Gets added to where_clauses; included more for backwards compatibility and as a user-friendly way of subsetting the view in a simple way.
 457        """
 458        if columns is None:
 459            columns = []
 460        if where_clauses is None:
 461            where_clauses = []
 462
 463        if self.project_scope:
  464            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"  # the trailing '' keeps the SQL tuple syntax valid when only one project is in scope
 465            where_clauses.append(project_scope_clause)
 466
 467        if where_clauses:
 468            where_clauses = " AND ".join(where_clauses)
 469            where_clauses = f"WHERE {where_clauses} ;"
 470        else:
 471            where_clauses = ";"
 472
 473        if columns:
 474            columns = ",".join(columns)
 475        else:
 476            columns = "*"
 477
 478        self.fileview_query = (
 479            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
 480        )
 481
 482        return
 483
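    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # How the parameters combine into `fileview_query`, assuming the master
    # fileview is "syn333" and no project_scope is set:
    #
    #     store._build_query(columns=["id", "path"], where_clauses=["parentId='syn111'"])
    #     # store.fileview_query == "SELECT id,path FROM syn333 WHERE parentId='syn111' ;"
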
 484    @staticmethod
 485    @tracer.start_as_current_span("SynapseStorage::login")
 486    def login(
 487        synapse_cache_path: Optional[str] = None,
 488        access_token: Optional[str] = None,
 489    ) -> synapseclient.Synapse:
 490        """Login to Synapse
 491
 492        Args:
  493            synapse_cache_path (Optional[str], optional): location of the synapse cache. Defaults to None.
  494            access_token (Optional[str], optional): A synapse access token. Defaults to None.
 495
 496        Raises:
  497            ValueError: If unable to log in with the access token
 498
 499        Returns:
 500            synapseclient.Synapse: A Synapse object that is logged in
 501        """
 502        # If no token is provided, try retrieving access token from environment
 503        if not access_token:
 504            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
 505
 506        # login using a token
 507        if access_token:
 508            try:
 509                syn = synapseclient.Synapse(
 510                    cache_root_dir=synapse_cache_path,
 511                    debug=False,
 512                    skip_checks=True,
 513                    cache_client=False,
 514                )
 515                syn.login(authToken=access_token, silent=True)
 516                current_span = trace.get_current_span()
 517                if current_span.is_recording():
 518                    current_span.set_attribute("user.id", syn.credentials.owner_id)
 519            except SynapseHTTPError as exc:
 520                raise ValueError(
 521                    "No access to resources. Please make sure that your token is correct"
 522                ) from exc
 523        else:
 524            # login using synapse credentials provided by user in .synapseConfig (default) file
 525            syn = synapseclient.Synapse(
 526                configPath=CONFIG.synapse_configuration_path,
 527                cache_root_dir=synapse_cache_path,
 528                debug=False,
 529                skip_checks=True,
 530                cache_client=False,
 531            )
 532            syn.login(silent=True)
 533            current_span = trace.get_current_span()
 534            if current_span.is_recording():
 535                current_span.set_attribute("user.id", syn.credentials.owner_id)
 536        return syn
 537
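    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # Logging in directly; when no token is passed, the method falls back to the
    # SYNAPSE_ACCESS_TOKEN environment variable, then to the .synapseConfig file.
    #
    #     syn = SynapseStorage.login(
    #         synapse_cache_path="/tmp/synapse_cache",  # placeholder cache location
    #         access_token=os.getenv("SYNAPSE_ACCESS_TOKEN"),
    #     )
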
 538    def missing_entity_handler(method):
 539        def wrapper(*args, **kwargs):
 540            try:
 541                return method(*args, **kwargs)
 542            except SynapseHTTPError as ex:
 543                str_message = str(ex).replace("\n", "")
 544                if "trash" in str_message or "does not exist" in str_message:
 545                    logging.warning(str_message)
 546                    return None
 547                else:
 548                    raise ex
 549
 550        return wrapper
 551
 552    def async_missing_entity_handler(method):
 553        """Decorator to handle missing entities in async methods."""
 554
 555        async def wrapper(*args: Any, **kwargs: Any) -> Any:
 556            try:
 557                return await method(*args, **kwargs)
 558            except SynapseHTTPError as ex:
 559                str_message = str(ex).replace("\n", "")
 560                if "trash" in str_message or "does not exist" in str_message:
 561                    logging.warning(str_message)
 562                    return None
 563                else:
 564                    raise ex
 565
 566        return wrapper
 567
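    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # Both handlers above are plain decorators: a decorated method returns None
    # instead of raising when Synapse reports a trashed or missing entity. A
    # hypothetical use would look like:
    #
    #     @missing_entity_handler
    #     def get_annotations_safely(self, synapse_id):
    #         return self.syn.get_annotations(synapse_id)
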
 568    def getStorageFileviewTable(self):
 569        """Returns the storageFileviewTable obtained during initialization."""
 570        return self.storageFileviewTable
 571
 572    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
 573        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
 574
 575        Args:
 576            currentUserId: synapse id for the user whose projects we want to get.
 577
 578        Returns:
 579            A dictionary with a next page token and the results.
 580        """
 581        all_results = self.syn.restGET(
 582            "/projects/user/{principalId}".format(principalId=currentUserId)
 583        )
 584
 585        while (
 586            "nextPageToken" in all_results
 587        ):  # iterate over next page token in results while there is any
 588            results_token = self.syn.restGET(
 589                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
 590                    principalId=currentUserId,
 591                    nextPageToken=all_results["nextPageToken"],
 592                )
 593            )
 594            all_results["results"].extend(results_token["results"])
 595
 596            if "nextPageToken" in results_token:
 597                all_results["nextPageToken"] = results_token["nextPageToken"]
 598            else:
 599                del all_results["nextPageToken"]
 600
 601        return all_results
 602
 603    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
 604    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
 605        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
 606
 607        Returns:
 608            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
 609        """
 610
  611        # get the set of all storage Synapse projects accessible for this pipeline
 612        storageProjects = self.storageFileviewTable["projectId"].unique()
 613
  614        # get the set of storage Synapse projects accessible for this user
 615        # get a list of projects from Synapse
 616        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
 617            current_user_id=self.syn.credentials.owner_id, syn=self.syn
 618        )
 619        project_id_to_name_dict = {}
 620        current_user_projects = []
 621        for project_header in current_user_project_headers:
 622            project_id_to_name_dict[project_header.get("id")] = project_header.get(
 623                "name"
 624            )
 625            current_user_projects.append(project_header.get("id"))
 626
 627        # find set of user projects that are also in this pipeline's storage projects set
 628        storageProjects = list(set(storageProjects) & set(current_user_projects))
 629
 630        # Limit projects to scope if specified
 631        if project_scope:
 632            storageProjects = list(set(storageProjects) & set(project_scope))
 633
 634            if not storageProjects:
 635                raise Warning(
 636                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
 637                )
 638
 639        # prepare a return list of project IDs and names
 640        projects = []
 641        for projectId in storageProjects:
 642            project_name_from_project_header = project_id_to_name_dict.get(projectId)
 643            projects.append((projectId, project_name_from_project_header))
 644
 645        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
 646
 647        return sorted_projects_list
 648
 649    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
 650    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
 651        """Gets all datasets in folder under a given storage project that the current user has access to.
 652
 653        Args:
 654            projectId: synapse ID of a storage project.
 655
 656        Returns:
 657            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
 658            None: If the projectId cannot be found on Synapse.
 659        """
 660
 661        # select all folders and fetch their names from within the storage project;
 662        # if folder content type is defined, only select folders that contain datasets
 663        if "contentType" in self.storageFileviewTable.columns:
 664            foldersTable = self.storageFileviewTable[
 665                (self.storageFileviewTable["contentType"] == "dataset")
 666                & (self.storageFileviewTable["projectId"] == projectId)
 667            ]
 668        else:
 669            foldersTable = self.storageFileviewTable[
 670                (self.storageFileviewTable["type"] == "folder")
 671                & (self.storageFileviewTable["parentId"] == projectId)
 672            ]
 673
  674        # get an array of tuples (folderId, folderName)
  675        # some folders are part of datasets; others contain datasets
  676        # each dataset's parent is the project; folders that are part of a dataset have another folder as a parent
  677        # to select folders if and only if they contain datasets, check for each folder
  678        # whether its parent is the project; if so, that folder contains a dataset,
  679        # unless the folder list has already been filtered to dataset folders based on the contentType attribute above
 680
 681        datasetList = []
 682        folderProperties = ["id", "name"]
 683        for folder in list(
 684            foldersTable[folderProperties].itertuples(index=False, name=None)
 685        ):
 686            datasetList.append(folder)
 687
 688        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
 689
 690        return sorted_dataset_list
 691
 692    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
 693    def getFilesInStorageDataset(
 694        self, datasetId: str, fileNames: List = None, fullpath: bool = True
 695    ) -> List[Tuple[str, str]]:
 696        """Gets all files (excluding manifest files) in a given dataset folder.
 697
 698        Args:
 699            datasetId: synapse ID of a storage dataset.
  700            fileNames: get a list of files with particular names; defaults to None, in which case all dataset files are returned (except bookkeeping files, e.g.
  701                metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
 702            fullpath: if True return the full path as part of this filename; otherwise return just base filename
 703
 704        Returns:
 705            A list of files; the list consists of tuples (fileId, fileName).
 706
 707        Raises:
  708            ValueError: If the fileview is empty. LookupError: If the dataset ID cannot be found in the fileview.
 709        """
 710        file_list = []
 711
  712        # Get the path to the dataset folder by using its children, to avoid cases where the dataset is the scope of the view
 713        if self.storageFileviewTable.empty:
 714            raise ValueError(
 715                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
 716            )
 717
 718        child_path = self.storageFileviewTable.loc[
 719            self.storageFileviewTable["parentId"] == datasetId, "path"
 720        ]
 721        if child_path.empty:
 722            raise LookupError(
 723                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
 724            )
 725        child_path = child_path.iloc[0]
 726
 727        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
 728        parent = child_path.split("/")[:-1]
 729        parent = "/".join(parent)
 730
 731        # Format dataset path to be used in table query
 732        dataset_path = f"'{parent}/%'"
 733
  734        # When querying, only include entities of type 'file', to exclude folders and subdirectories
 735        where_clauses = [f"path like {dataset_path}", "type='file'"]
 736
 737        # Requery the fileview to specifically get the files in the given dataset
 738        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
 739
 740        # Exclude manifest files
 741        non_manifest_files = self.storageFileviewTable.loc[
 742            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
 743            :,
 744        ]
 745
 746        # Remove all files that are not in the list of fileNames
 747        if fileNames:
 748            filename_regex = "|".join(fileNames)
 749
 750            matching_files = non_manifest_files["path"].str.contains(
 751                filename_regex, case=False, regex=True
 752            )
 753
 754            non_manifest_files = non_manifest_files.loc[matching_files, :]
 755
 756        # Truncate path if necessary
 757        if not fullpath:
 758            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
 759
 760        # Return list of files as expected by other methods
 761        file_list = list(non_manifest_files.itertuples(index=False, name=None))
 762
 763        return file_list
 764
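    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # Listing the files in a dataset folder; the dataset ID is a placeholder.
    # With fullpath=False only base filenames are returned in each tuple.
    #
    #     files = store.getFilesInStorageDataset("syn12345678", fullpath=False)
    #     # -> e.g. [("syn111", "sample_A.bam"), ("syn222", "sample_B.bam")]
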
 765    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
 766        """If both censored and uncensored manifests are present, return uncensored manifest; if only one manifest is present, return manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one.
 767        Args:
 768        manifest: a dataframe contains name and id of manifests in a given asset view
 769
 770        Return:
 771        manifest_syn_id: id of a given censored or uncensored manifest
 772        """
 773        censored_regex = re.compile(".*censored.*")
 774        censored = manifest["name"].str.contains(censored_regex)
 775        if any(censored):
 776            # Try to use uncensored manifest first
 777            not_censored = ~censored
 778            if any(not_censored):
 779                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
 780            # if only censored manifests are available, just use the first censored manifest
 781            else:
 782                manifest_syn_id = manifest["id"].iloc[0]
 783
 784        # otherwise, use the first (implied only) version that exists
 785        else:
 786            manifest_syn_id = manifest["id"].iloc[0]
 787
 788        return manifest_syn_id
 789
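    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # When both censored and uncensored manifests exist, the uncensored one is
    # preferred; the IDs are placeholders.
    #
    #     manifest = pd.DataFrame({
    #         "id": ["syn111", "syn222"],
    #         "name": [
    #             "synapse_storage_manifest.csv",
    #             "synapse_storage_manifest_censored.csv",
    #         ],
    #     })
    #     store._get_manifest_id(manifest)  # -> "syn111"
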
 790    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
 791    def getDatasetManifest(
 792        self,
 793        datasetId: str,
 794        downloadFile: bool = False,
 795        newManifestName: str = "",
 796        use_temporary_folder: bool = True,
 797    ) -> Union[str, File]:
 798        """Gets the manifest associated with a given dataset.
 799
 800        Args:
 801            datasetId: synapse ID of a storage dataset.
 802            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
 803            newManifestName: new name of a manifest that gets downloaded
 804            use_temporary_folder: boolean argument indicating if a temporary folder
 805                should be used to store the manifest file. This is useful when running
 806                this code as an API server where multiple requests could be made at the
 807                same time. This is set to False when the code is being used from the
 808                CLI. Defaults to True.
 809
 810        Returns:
  811            manifest_syn_id (String): Synapse ID of existing manifest file.
  812            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
  813            "" (String): No pre-existing manifest in dataset.
 814        """
 815        manifest_data = ""
 816
 817        # get a list of files containing the manifest for this dataset (if any)
 818        all_files = self.storageFileviewTable
 819
 820        # construct regex based on manifest basename in the config
  821        manifest_re = re.compile(os.path.basename(self.manifest) + r".*\.[tc]sv")
 822
 823        # search manifest based on given manifest basename regex above
 824        # and return a dataframe containing name and id of manifests in a given asset view
 825        manifest = all_files[
 826            (all_files["name"].str.contains(manifest_re, regex=True))
 827            & (all_files["parentId"] == datasetId)
 828        ]
 829
 830        manifest = manifest[["id", "name"]]
 831
  832        # if there is no pre-existing manifest in the specified dataset
 833        if manifest.empty:
 834            logger.warning(
 835                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
 836            )
 837            return ""
 838
  839        # if there is an existing manifest
 840        else:
 841            manifest_syn_id = self._get_manifest_id(manifest)
 842            if downloadFile:
 843                md = ManifestDownload(
 844                    self.syn,
 845                    manifest_id=manifest_syn_id,
 846                    synapse_entity_tracker=self.synapse_entity_tracker,
 847                )
 848                manifest_data = md.download_manifest(
 849                    newManifestName=newManifestName,
 850                    manifest_df=manifest,
 851                    use_temporary_folder=use_temporary_folder,
 852                )
  853                # TO DO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
 854                # then we should catch the error here without returning an empty string.
 855                if not manifest_data:
 856                    logger.debug(
 857                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
 858                    )
 859                return manifest_data
 860            return manifest_syn_id
 861
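    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # Fetching a dataset's manifest; the dataset ID is a placeholder. Without
    # downloadFile only the Synapse ID (or "" if none exists) is returned.
    #
    #     manifest_syn_id = store.getDatasetManifest("syn12345678")
    #     manifest_entity = store.getDatasetManifest(
    #         "syn12345678", downloadFile=True, newManifestName="my_manifest"
    #     )
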
 862    def getDataTypeFromManifest(self, manifestId: str):
 863        """Fetch a manifest and return data types of all columns
 864        Args:
 865            manifestId: synapse ID of a manifest
 866        """
 867        # get manifest file path
 868        manifest_entity = self.synapse_entity_tracker.get(
 869            synapse_id=manifestId, syn=self.syn, download_file=True
 870        )
 871        manifest_filepath = manifest_entity.path
 872
 873        # load manifest dataframe
 874        manifest = load_df(
 875            manifest_filepath,
 876            preserve_raw_input=False,
 877            data_model=False,
 878        )
 879
 880        # convert the dataFrame to use best possible dtypes.
 881        manifest_new = manifest.convert_dtypes()
 882
 883        # get data types of columns
 884        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
 885
 886        # return the result as a dictionary
 887        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
 888
 889        return result_dict
 890
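    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # The returned dictionary maps column names to pandas dtype strings, for
    # example (the manifest ID and column names are placeholders):
    #
    #     store.getDataTypeFromManifest("syn12345678")
    #     # -> {"Filename": "string", "Sample ID": "Int64", "entityId": "string"}
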
 891    def _get_files_metadata_from_dataset(
 892        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
 893    ) -> Optional[dict]:
 894        """retrieve file ids under a particular datasetId
 895
 896        Args:
 897            datasetId (str): a dataset id
 898            only_new_files (bool): if only adding new files that are not already exist
 899            manifest (pd.DataFrame): metadata manifest dataframe. Default to None.
 900
 901        Returns:
 902            a dictionary that contains filename and entityid under a given datasetId or None if there is nothing under a given dataset id are not available
 903        """
 904        dataset_files = self.getFilesInStorageDataset(datasetId)
 905        if dataset_files:
 906            dataset_file_names_id_dict = self._get_file_entityIds(
 907                dataset_files, only_new_files=only_new_files, manifest=manifest
 908            )
 909            return dataset_file_names_id_dict
 910        else:
 911            return None
 912
 913    def add_entity_id_and_filename(
 914        self, datasetId: str, manifest: pd.DataFrame
 915    ) -> pd.DataFrame:
 916        """add entityid and filename column to an existing manifest assuming entityId column is not already present
 917
 918        Args:
 919            datasetId (str): dataset syn id
 920            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
 921
 922        Returns:
 923            pd.DataFrame: returns a pandas dataframe
 924        """
 925        # get file names and entity ids of a given dataset
 926        dataset_files_dict = self._get_files_metadata_from_dataset(
 927            datasetId, only_new_files=False
 928        )
 929
 930        if dataset_files_dict:
 931            # turn manifest dataframe back to a dictionary for operation
 932            manifest_dict = manifest.to_dict("list")
 933
 934            # update Filename column
 935            # add entityId column to the end
 936            manifest_dict.update(dataset_files_dict)
 937
  938            # if the Component column exists in the existing manifest, fill in that column
 939            if "Component" in manifest_dict.keys():
 940                manifest_dict["Component"] = manifest_dict["Component"] * max(
 941                    1, len(manifest_dict["Filename"])
 942                )
 943
 944            # turn dictionary back to a dataframe
 945            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
 946            manifest_df_updated = manifest_df_index.transpose()
 947
 948            # fill na with empty string
 949            manifest_df_updated = manifest_df_updated.fillna("")
 950
 951            # drop index
 952            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
 953
 954            return manifest_df_updated
 955        else:
 956            return manifest
 957
 958    def fill_in_entity_id_filename(
 959        self, datasetId: str, manifest: pd.DataFrame
 960    ) -> Tuple[List, pd.DataFrame]:
 961        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 962
 963        Args:
 964            datasetId (str): dataset syn id
 965            manifest (pd.DataFrame): existing manifest dataframe.
 966
 967        Returns:
 968            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 969        """
 970        # get dataset file names and entity id as a list of tuple
 971        dataset_files = self.getFilesInStorageDataset(datasetId)
 972
 973        # update manifest with additional filenames, if any
 974        # note that if there is an existing manifest and there are files in the dataset
 975        # the columns Filename and entityId are assumed to be present in manifest schema
  976        # TODO: use idiomatic pandas syntax
 977        if not dataset_files:
 978            manifest = manifest.fillna("")
 979            return dataset_files, manifest
 980
 981        all_files = self._get_file_entityIds(
 982            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 983        )
 984        new_files = self._get_file_entityIds(
 985            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 986        )
 987
 988        all_files = pd.DataFrame(all_files)
 989        new_files = pd.DataFrame(new_files)
 990
 991        # update manifest so that it contains new dataset files
 992        manifest = (
 993            pd.concat([manifest, new_files], sort=False)
 994            .reset_index()
 995            .drop("index", axis=1)
 996        )
 997
 998        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 999        manifest_reindex = manifest.set_index("entityId")
1000        all_files_reindex = all_files.set_index("entityId")
1001        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1002            manifest_reindex
1003        )
1004
1005        # Check if individual file paths in manifest and from synapse match
1006        file_paths_match = (
1007            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1008        )
1009
 1010        # If any of the file paths do not match, update the manifest with the filepaths from synapse
1011        if not file_paths_match.all():
1012            manifest_reindex.loc[
1013                ~file_paths_match, "Filename"
1014            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1015
1016            # reformat manifest for further use
1017            manifest = manifest_reindex.reset_index()
1018            entityIdCol = manifest.pop("entityId")
1019            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1020
1021        manifest = manifest.fillna("")
1022        return dataset_files, manifest
1023
1024    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1025    def updateDatasetManifestFiles(
1026        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1027    ) -> Union[Tuple[str, pd.DataFrame], None]:
1028        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1029
1030        Args:
1031            dmge: DataModelGraphExplorer Instance
1032            datasetId: synapse ID of a storage dataset.
1033            store: if set to True store updated manifest in asset store; if set to False
1034            return a Pandas dataframe containing updated manifest but do not store to asset store
1035
1036
1037        Returns:
1038            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1039            If there is no existing manifest or if the manifest does not have an entityId column, return None
1040        """
1041
1042        # get existing manifest Synapse ID
1043        manifest_id = self.getDatasetManifest(datasetId)
1044
1045        # if there is no manifest return None
1046        if not manifest_id:
1047            return None
1048
1049        manifest_entity = self.synapse_entity_tracker.get(
1050            synapse_id=manifest_id, syn=self.syn, download_file=True
1051        )
1052        manifest_filepath = manifest_entity.path
1053        manifest = load_df(manifest_filepath)
1054
1055        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1056        if "entityId" not in manifest.columns:
1057            return None
1058
1059        manifest_is_file_based = "Filename" in manifest.columns
1060
1061        if manifest_is_file_based:
1062            # update manifest with additional filenames, if any
1063            # note that if there is an existing manifest and there are files in the dataset
1064            # the columns Filename and entityId are assumed to be present in manifest schema
 1065            # TODO: use idiomatic pandas syntax
1066            dataset_files, manifest = self.fill_in_entity_id_filename(
1067                datasetId, manifest
1068            )
1069            if dataset_files:
1070                # update the manifest file, so that it contains the relevant entity IDs
1071                if store:
1072                    manifest.to_csv(manifest_filepath, index=False)
1073
1074                    # store manifest and update associated metadata with manifest on Synapse
1075                    manifest_id = self.associateMetadataWithFiles(
1076                        dmge, manifest_filepath, datasetId
1077                    )
1078
1079        return manifest_id, manifest
1080
1081    def _get_file_entityIds(
1082        self,
1083        dataset_files: List,
1084        only_new_files: bool = False,
1085        manifest: pd.DataFrame = None,
1086    ):
1087        """
1088        Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
1089
1090        Args:
1091            manifest: metadata manifest
 1092            dataset_files: List of all files in a dataset
1093            only_new_files: boolean to control whether only new files are returned or all files in the dataset
1094        Returns:
1095            files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
1096        """
1097        files = {"Filename": [], "entityId": []}
1098
1099        if only_new_files:
1100            if manifest is None:
1101                raise UnboundLocalError(
1102                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1103                )
1104
1105            if "entityId" not in manifest.columns:
1106                raise ValueError(
1107                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1108                    "Please generate an empty manifest without annotations, manually add annotations to the "
1109                    "appropriate files in the manifest, and then try again."
1110                )
1111
1112            # find new files (that are not in the current manifest) if any
1113            for file_id, file_name in dataset_files:
 1114                if file_id not in manifest["entityId"].values:
1115                    files["Filename"].append(file_name)
1116                    files["entityId"].append(file_id)
1117        else:
1118            # get all files
1119            for file_id, file_name in dataset_files:
1120                files["Filename"].append(file_name)
1121                files["entityId"].append(file_id)
1122
1123        return files
1124
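    # --- Editorial example: a hedged sketch, not part of the original source. ---
    # Splitting dataset files into new vs. already-listed files; IDs are placeholders.
    #
    #     dataset_files = [("syn111", "a.txt"), ("syn222", "b.txt")]
    #     manifest = pd.DataFrame({"Filename": ["a.txt"], "entityId": ["syn111"]})
    #     store._get_file_entityIds(dataset_files, only_new_files=True, manifest=manifest)
    #     # -> {"Filename": ["b.txt"], "entityId": ["syn222"]}
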
1125    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1126    def getProjectManifests(
1127        self, projectId: str
1128    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1129        """Gets all metadata manifest files across all datasets in a specified project.
1130
1131        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1132                 as a list of tuples, one for each manifest:
1133                    [
1134                        (
1135                            (datasetId, dataName),
1136                            (manifestId, manifestName),
1137                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1138                        ),
1139                        ...
1140                    ]
1141
1142        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1143        """
1144        component = None
1145        entity = None
1146        manifests = []
1147
1148        datasets = self.getStorageDatasetsInProject(projectId)
1149
1150        for datasetId, datasetName in datasets:
1151            # encode information about the manifest in a simple list (so that R clients can unpack it)
1152            # eventually can serialize differently
1153
1154            # Get synID of manifest for a dataset
1155            manifestId = self.getDatasetManifest(datasetId)
1156
1157            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1158            if manifestId:
1159                annotations = self.getFileAnnotations(manifestId)
1160
1161                # If manifest has annotations specifying component, use that
1162                if annotations and "Component" in annotations:
1163                    component = annotations["Component"]
1164                    entity = self.synapse_entity_tracker.get(
1165                        synapse_id=manifestId, syn=self.syn, download_file=False
1166                    )
1167                    manifest_name = entity["properties"]["name"]
1168
1169                # otherwise download the manifest and parse for information
1170                elif not annotations or "Component" not in annotations:
1171                    logging.debug(
1172                        f"No component annotations have been found for manifest {manifestId}. "
1173                        "The manifest will be downloaded and parsed instead. "
1174                        "For increased speed, add component annotations to manifest."
1175                    )
1176
1177                    manifest_info = self.getDatasetManifest(
1178                        datasetId, downloadFile=True
1179                    )
1180                    manifest_name = manifest_info["properties"].get("name", "")
1181
1182                    if not manifest_name:
1183                        logger.error(f"Failed to download manifests from {datasetId}")
1184
1185                    manifest_path = manifest_info["path"]
1186
1187                    manifest_df = load_df(manifest_path)
1188
1189                    # Get component from component column if it exists
1190                    if (
1191                        "Component" in manifest_df
1192                        and not manifest_df["Component"].empty
1193                    ):
 1194                        # deduplicate the values in the Component column
 1195                        component = list(set(manifest_df["Component"]))
1196
1197                        # Added to address issues raised during DCA testing
1198                        if "" in component:
1199                            component.remove("")
1200
1201                        if len(component) == 1:
1202                            component = component[0]
1203                        elif len(component) > 1:
1204                            logging.warning(
1205                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1206                                "Behavior of manifests with multiple components is undefined."
1207                            )
1208            else:
1209                manifest_name = ""
1210                component = None
1211            if component:
1212                manifest = (
1213                    (datasetId, datasetName),
1214                    (manifestId, manifest_name),
1215                    (component, component),
1216                )
1217            elif manifestId:
1218                logging.debug(
1219                    f"Manifest {manifestId} does not have an associated Component"
1220                )
1221                manifest = (
1222                    (datasetId, datasetName),
1223                    (manifestId, manifest_name),
1224                    ("", ""),
1225                )
1226            else:
1227                manifest = (
1228                    (datasetId, datasetName),
1229                    ("", ""),
1230                    ("", ""),
1231                )
1232
1233            if manifest:
1234                manifests.append(manifest)
1235
1236        return manifests
1237
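        # Usage sketch (hypothetical; assumes a configured SynapseStorage instance
        # `store` and a project ID visible in its fileview):
        #
        #     for dataset, manifest, component in store.getProjectManifests("syn12345"):
        #         dataset_id, dataset_name = dataset
        #         manifest_id, manifest_name = manifest
        #         print(dataset_id, manifest_id, component[0])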
1238    def upload_project_manifests_to_synapse(
1239        self, dmge: DataModelGraphExplorer, projectId: str
1240    ) -> List[str]:
1241        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1242
1243        Returns: List of dataset names whose manifests were uploaded as tables.
1244        """
1245
1246        manifests = []
1247        manifest_loaded = []
1248        datasets = self.getStorageDatasetsInProject(projectId)
1249
1250        for datasetId, datasetName in datasets:
1251            # encode information about the manifest in a simple list (so that R clients can unpack it)
1252            # eventually can serialize differently
1253
1254            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1255
1256            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1257            if manifest_info:
1258                manifest_id = manifest_info["properties"]["id"]
1259                manifest_name = manifest_info["properties"]["name"]
1260                manifest_path = manifest_info["path"]
1261                manifest_df = load_df(manifest_path)
1262                manifest_table_id = self.uploadDB(
1263                    dmge=dmge,
1264                    manifest=manifest_df,
1265                    datasetId=datasetId,
1266                    table_name=datasetName,
1267                )
1268                manifest_loaded.append(datasetName)
1269        return manifest_loaded
1270
1271    def upload_annotated_project_manifests_to_synapse(
1272        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1273    ) -> Tuple[List[tuple], List[tuple]]:
1274        """
1275        Purpose:
1276            For all manifests in a project, upload them as tables and annotate the manifest csv.
1277            Assumes the manifest is already present as a CSV in a dataset in the project.
1278
1279        """
1280        # Local imports: DataModelParser and DataModelGraph are not imported at
            # module level in this file (module paths assumed to match the existing
            # DataModelGraphExplorer import from schematic.schemas)
            from schematic.schemas.data_model_graph import DataModelGraph
            from schematic.schemas.data_model_parser import DataModelParser

            # Instantiate DataModelParser
1281        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1282        # Parse Model
1283        parsed_data_model = data_model_parser.parse_model()
1284
1285        # Instantiate DataModelGraph
1286        data_model_grapher = DataModelGraph(parsed_data_model)
1287
1288        # Generate graph
1289        graph_data_model = data_model_grapher.generate_data_model_graph()
1290
1291        # Instantiate DataModelGraphExplorer
1292        dmge = DataModelGraphExplorer(graph_data_model)
1293
1294        manifests = []
1295        manifest_loaded = []
1296        datasets = self.getStorageDatasetsInProject(projectId)
1297        for datasetId, datasetName in datasets:
1298            # encode information about the manifest in a simple list (so that R clients can unpack it)
1299            # eventually can serialize differently
1300
1301            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1302            manifests.append(manifest)
1303
1304            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1305
1306            if manifest_info:
1307                manifest_id = manifest_info["properties"]["id"]
1308                manifest_name = manifest_info["properties"]["name"]
1309                manifest_path = manifest_info["path"]
1310                manifest = (
1311                    (datasetId, datasetName),
1312                    (manifest_id, manifest_name),
1313                    ("", ""),
1314                )
1315                if not dry_run:
1316                    self.associateMetadataWithFiles(
1317                        dmge, manifest_path, datasetId, manifest_record_type="table"
1318                    )
1319                manifest_loaded.append(manifest)
1320
1321        return manifests, manifest_loaded
1322
1323    def move_entities_to_new_project(
1324        self,
1325        projectId: str,
1326        newProjectId: str,
1327        returnEntities: bool = False,
1328        dry_run: bool = False,
1329    ):
1330        """
1331        For each manifest csv in a project, look up all the entity ids that are associated with it.
1332        Look up each entity in the files and move the entity to the new project.
1333        """
1334
1335        manifests = []
1336        manifest_loaded = []
1337        datasets = self.getStorageDatasetsInProject(projectId)
1338        if datasets:
1339            for datasetId, datasetName in datasets:
1340                # encode information about the manifest in a simple list (so that R clients can unpack it)
1341                # eventually can serialize differently
1342
1343                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1344                manifests.append(manifest)
1345
1346                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1347                if manifest_info:
1348                    manifest_id = manifest_info["properties"]["id"]
1349                    manifest_name = manifest_info["properties"]["name"]
1350                    manifest_path = manifest_info["path"]
1351                    manifest_df = load_df(manifest_path)
1352
1353                    manifest = (
1354                        (datasetId, datasetName),
1355                        (manifest_id, manifest_name),
1356                        ("", ""),
1357                    )
1358                    manifest_loaded.append(manifest)
1359
1360                    annotation_entities = self.storageFileviewTable[
1361                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1362                        & (self.storageFileviewTable["type"] == "folder")
1363                    ]["id"]
1364
1365                    if returnEntities:
1366                        for entityId in annotation_entities:
1367                            if not dry_run:
1368                                moved_entity = self.syn.move(entityId, datasetId)
1369                                self.synapse_entity_tracker.add(
1370                                    synapse_id=moved_entity.id, entity=moved_entity
1371                                )
1372                            else:
1373                                logging.info(
1374                                    f"{entityId} will be moved to folder {datasetId}."
1375                                )
1376                    else:
1377                        # generate project folder
1378                        archive_project_folder = Folder(
1379                            projectId + "_archive", parent=newProjectId
1380                        )
1381                        archive_project_folder = self.syn.store(archive_project_folder)
1382                        self.synapse_entity_tracker.add(
1383                            synapse_id=archive_project_folder.id,
1384                            entity=archive_project_folder,
1385                        )
1386
1387                        # generate dataset folder
1388                        dataset_archive_folder = Folder(
1389                            "_".join([datasetId, datasetName, "archive"]),
1390                            parent=archive_project_folder.id,
1391                        )
1392                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1393                        self.synapse_entity_tracker.add(
1394                            synapse_id=dataset_archive_folder.id,
1395                            entity=dataset_archive_folder,
1396                        )
1397
1398                        for entityId in annotation_entities:
1399                            # move entities to folder
1400                            if not dry_run:
1401                                moved_entity = self.syn.move(
1402                                    entityId, dataset_archive_folder.id
1403                                )
1404                                self.synapse_entity_tracker.add(
1405                                    synapse_id=moved_entity.id, entity=moved_entity
1406                                )
1407                            else:
1408                                logging.info(
1409                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1410                                )
1411        else:
1412            raise LookupError(
1413                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1414            )
1415        return manifests, manifest_loaded
1416
1417    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1418    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1419        """Download synapse table as a pd dataframe; return table schema and etags as results too
1420
1421        Args:
1422            synapse_id: synapse ID of the table to query
1423        """
1424
1425        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1426        df = results.asDataFrame(
1427            rowIdAndVersionInIndex=False,
1428            na_values=STR_NA_VALUES_FILTERED,
1429            keep_default_na=False,
1430        )
1431
1432        return df, results
1433
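        # Usage sketch (hypothetical table ID; `store` is a configured
        # SynapseStorage instance):
        #
        #     df, results = store.get_synapse_table("syn987")
        #     print(df.shape)      # table contents as a DataFrame
        #     print(results.etag)  # etag carried on the CsvFileTable results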
1434    @missing_entity_handler
1435    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1436    def uploadDB(
1437        self,
1438        dmge: DataModelGraphExplorer,
1439        manifest: pd.DataFrame,
1440        datasetId: str,
1441        table_name: str,
1442        restrict: bool = False,
1443        table_manipulation: str = "replace",
1444        table_column_names: str = "class_label",
1445    ):
1446        """
1447        Method to upload a database to an asset store. In synapse, this will upload a metadata table
1448
1449        Args:
1450            dmge: DataModelGraphExplorer object
1451            manifest: pd.Df manifest to upload
1452            datasetId: synID of the dataset for the manifest
1453            table_name: name of the table to be uploaded
1454            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1456            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1457            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1458                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1459                display label formatting.
1460        Returns:
1461            manifest_table_id: synID of the uploaded table
1462            manifest: the original manifest
1463            table_manifest: manifest formatted appropriately for the table
1464
1465        """
1466
1467        col_schema, table_manifest = self.formatDB(
1468            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1469        )
1470
1471        manifest_table_id = self.buildDB(
1472            datasetId,
1473            table_name,
1474            col_schema,
1475            table_manifest,
1476            table_manipulation,
1477            dmge,
1478            restrict,
1479        )
1480
1481        return manifest_table_id, manifest, table_manifest
1482
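        # Usage sketch (hypothetical IDs and names; `dmge` is a
        # DataModelGraphExplorer built from the project's data model):
        #
        #     table_id, manifest, table_manifest = store.uploadDB(
        #         dmge=dmge,
        #         manifest=manifest_df,  # pd.DataFrame including an "entityId" column
        #         datasetId="syn456",
        #         table_name="patient_synapse_storage_manifest_table",
        #         table_manipulation="replace",
        #     )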
1483    @tracer.start_as_current_span("SynapseStorage::formatDB")
1484    def formatDB(self, dmge, manifest, table_column_names):
1485        """
1486        Method to format a manifest appropriately for upload as a table
1487
1488        Args:
1489            dmge: DataModelGraphExplorer object
1490            manifest: pd.Df manifest to upload
1491            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1492                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1493                display label formatting.
1494        Returns:
1495            col_schema: schema for table columns: type, size, etc
1496            table_manifest: formatted manifest
1497
1498        """
1499        # Rename the manifest columns to display names to match fileview
1500
1501        blacklist_chars = ["(", ")", ".", " ", "-"]
1502        manifest_columns = manifest.columns.tolist()
1503
1504        table_manifest = deepcopy(manifest)
1505
1506        if table_column_names == "display_name":
1507            cols = table_manifest.columns
1508
1509        elif table_column_names == "display_label":
1510            cols = [
1511                str(col).translate({ord(x): "" for x in blacklist_chars})
1512                for col in manifest_columns
1513            ]
1514
1515        elif table_column_names == "class_label":
1516            cols = [
1517                get_class_label_from_display_name(str(col)).translate(
1518                    {ord(x): "" for x in blacklist_chars}
1519                )
1520                for col in manifest_columns
1521            ]
1522        else:
1523            raise ValueError(
1524                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1525            )
1526
1527        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1528
1529        # Reset column names in table manifest
1530        table_manifest.columns = cols
1531
1532        # move entity id to end of df
1533        entity_col = table_manifest.pop("entityId")
1534        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1535
1536        # Get the column schema
1537        col_schema = as_table_columns(table_manifest)
1538
1539        # Set the Id column length to 64 (as_table_columns does not set this automatically)
1540        for i, col in enumerate(col_schema):
1541            if col["name"].lower() == "id":
1542                col_schema[i]["maximumSize"] = 64
1543
1544        return col_schema, table_manifest
1545
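        # Illustration of the three column-name styles for a display name such as
        # "Family History (Mother)" (indicative only, per the blacklist translation above):
        #
        #     display_name  -> "Family History (Mother)"  (kept as-is)
        #     display_label -> "FamilyHistoryMother"      (blacklisted characters stripped)
        #     class_label   -> upper-camelcase class label, then blacklisted characters stripped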
1546    @tracer.start_as_current_span("SynapseStorage::buildDB")
1547    def buildDB(
1548        self,
1549        datasetId: str,
1550        table_name: str,
1551        col_schema: List,
1552        table_manifest: pd.DataFrame,
1553        table_manipulation: str,
1554        dmge: DataModelGraphExplorer,
1555        restrict: bool = False,
1556    ):
1557        """
1558        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1559        Calls TableOperations class to execute
1560
1561        Args:
1562            datasetId: synID of the dataset for the manifest
1563            table_name: name of the table to be uploaded
1564            col_schema: schema for table columns: type, size, etc from `formatDB`
1565            table_manifest: formatted manifest that can be uploaded as a table
1566            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1567            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1568
1569        Returns:
1570            manifest_table_id: synID of the uploaded table
1571
1572        """
1573        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1574        existing_table_id = self.syn.findEntityId(
1575            name=table_name, parent=table_parent_id
1576        )
1577
1578        tableOps = TableOperations(
1579            synStore=self,
1580            tableToLoad=table_manifest,
1581            tableName=table_name,
1582            datasetId=datasetId,
1583            existingTableId=existing_table_id,
1584            restrict=restrict,
1585            synapse_entity_tracker=self.synapse_entity_tracker,
1586        )
1587
1588        if not table_manipulation or existing_table_id is None:
1589            manifest_table_id = tableOps.createTable(
1590                columnTypeDict=col_schema,
1591                specifySchema=True,
1592            )
1593        elif existing_table_id is not None:
1594            if table_manipulation.lower() == "replace":
1595                manifest_table_id = tableOps.replaceTable(
1596                    specifySchema=True,
1597                    columnTypeDict=col_schema,
1598                )
1599            elif table_manipulation.lower() == "upsert":
1600                manifest_table_id = tableOps.upsertTable(
1601                    dmge=dmge,
1602                )
1603            elif table_manipulation.lower() == "update":
1604                manifest_table_id = tableOps.updateTable()
            else:
                raise ValueError(
                    f"Unsupported table_manipulation value: {table_manipulation}. Allowed values are 'replace', 'upsert', and 'update'."
                )
1605
1606        if table_manipulation and table_manipulation.lower() == "upsert":
1607            table_entity = self.synapse_entity_tracker.get(
1608                synapse_id=existing_table_id or manifest_table_id,
1609                syn=self.syn,
1610                download_file=False,
1611            )
1612            annos = OldAnnotations(
1613                id=table_entity.id,
1614                etag=table_entity.etag,
1615                values=table_entity.annotations,
1616            )
1617            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1618            annos = self.syn.set_annotations(annos)
1619            table_entity.etag = annos.etag
1620            table_entity.annotations = annos
1621
1622        return manifest_table_id
1623
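        # Behavior summary for `table_manipulation`, derived from the branches above:
        #
        #     None / no existing table -> createTable (new table built from col_schema)
        #     "replace"                -> replaceTable (schema and rows replaced)
        #     "upsert"                 -> upsertTable, then a "primary_key" annotation
        #                                 of the form "<Component>_id" is set on the table
        #     "update"                 -> updateTable (existing rows updated in place)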
1624    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1625    def upload_manifest_file(
1626        self,
1627        manifest,
1628        metadataManifestPath,
1629        datasetId,
1630        restrict_manifest,
1631        component_name="",
1632    ):
1633        # Update manifest to have the new entityId column
1634        manifest.to_csv(metadataManifestPath, index=False)
1635
1636        # store manifest to Synapse as a CSV
1637        # update file name
1638        file_name_full = metadataManifestPath.split("/")[-1]
1639        file_extension = file_name_full.split(".")[-1]
1640
1641        # Differentiate "censored" and "uncensored" manifest
1642        if "censored" in file_name_full:
1643            file_name_new = (
1644                os.path.basename(CONFIG.synapse_manifest_basename)
1645                + "_"
1646                + component_name
1647                + "_censored"
1648                + "."
1649                + file_extension
1650            )
1651        else:
1652            file_name_new = (
1653                os.path.basename(CONFIG.synapse_manifest_basename)
1654                + "_"
1655                + component_name
1656                + "."
1657                + file_extension
1658            )
1659
1660        manifest_synapse_file = None
1661        try:
1662            # Rename the file to file_name_new then revert
1663            # This is to maintain the original file name in case other code
1664            # expects the file to exist with the original name
1665            original_file_path = metadataManifestPath
1666            new_file_path = os.path.join(
1667                os.path.dirname(metadataManifestPath), file_name_new
1668            )
1669            os.rename(original_file_path, new_file_path)
1670
1671            manifest_synapse_file = self._store_file_for_manifest_upload(
1672                new_file_path=new_file_path,
1673                dataset_id=datasetId,
1674                existing_file_name=file_name_full,
1675                file_name_new=file_name_new,
1676                restrict_manifest=restrict_manifest,
1677            )
1678            manifest_synapse_file_id = manifest_synapse_file.id
1679
1680        finally:
1681            # Revert the file name back to the original
1682            os.rename(new_file_path, original_file_path)
1683
1684            if manifest_synapse_file:
1685                manifest_synapse_file.path = original_file_path
1686
1687        return manifest_synapse_file_id
1688
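        # Resulting file names (assuming CONFIG.synapse_manifest_basename resolves
        # to "synapse_storage_manifest" and component_name is "patient"):
        #
        #     uncensored manifest -> synapse_storage_manifest_patient.csv
        #     censored manifest   -> synapse_storage_manifest_patient_censored.csv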
1689    def _store_file_for_manifest_upload(
1690        self,
1691        new_file_path: str,
1692        dataset_id: str,
1693        existing_file_name: str,
1694        file_name_new: str,
1695        restrict_manifest: bool,
1696    ) -> File:
1697        """Handles a create or update of a manifest file that is going to be uploaded.
1698        If we already have a copy of the Entity in memory we will update that instance,
1699        otherwise create a new File instance to be created in Synapse. Once stored
1700        this will add the file to the `synapse_entity_tracker` for future reference.
1701
1702        Args:
1703            new_file_path (str): The path to the new manifest file
1704            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
1705            existing_file_name (str): The name of the existing file
1706            file_name_new (str): The name of the new file
1707            restrict_manifest (bool): Whether the manifest should be restricted
1708
1709        Returns:
1710            File: The stored manifest file
1711        """
1712        local_tracked_file_instance = (
1713            self.synapse_entity_tracker.search_local_by_parent_and_name(
1714                name=existing_file_name, parent_id=dataset_id
1715            )
1716            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1717                name=file_name_new, parent_id=dataset_id
1718            )
1719        )
1720
1721        if local_tracked_file_instance:
1722            local_tracked_file_instance.path = new_file_path
1723            local_tracked_file_instance.description = (
1724                "Manifest for dataset " + dataset_id
1725            )
1726            manifest_synapse_file = local_tracked_file_instance
1727        else:
1728            manifest_synapse_file = File(
1729                path=new_file_path,
1730                description="Manifest for dataset " + dataset_id,
1731                parent=dataset_id,
1732                name=file_name_new,
1733            )
1734
1735        manifest_synapse_file = self.syn.store(
1736            manifest_synapse_file, isRestricted=restrict_manifest
1737        )
1738
1739        self.synapse_entity_tracker.add(
1740            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1741        )
1742        return manifest_synapse_file
1743
1744    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1745        """get annotations asynchronously
1746
1747        Args:
1748            synapse_id (str): synapse id of the entity that the annotation belongs
1749
1750        Returns:
1751            Dict[str, Any]: The requested entity bundle matching
1752            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1753        """
1754        return await get_entity_id_bundle2(
1755            entity_id=synapse_id,
1756            request={"includeAnnotations": True},
1757            synapse_client=self.syn,
1758        )
1759
1760    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1761        """store annotation in an async way
1762
1763        Args:
1764            annotation_dict (dict): annotation in a dictionary format
1765
1766        Returns:
1767            Annotations: The stored annotations.
1768        """
1769        annotation_data = Annotations.from_dict(
1770            synapse_annotations=annotation_dict["annotations"]["annotations"]
1771        )
1772        annotation_class = Annotations(
1773            annotations=annotation_data,
1774            etag=annotation_dict["annotations"]["etag"],
1775            id=annotation_dict["annotations"]["id"],
1776        )
1777        annotation_storage_result = await annotation_class.store_async(
1778            synapse_client=self.syn
1779        )
1780        local_entity = self.synapse_entity_tracker.get(
1781            synapse_id=annotation_dict["annotations"]["id"],
1782            syn=self.syn,
1783            download_file=False,
1784            retrieve_if_not_present=False,
1785        )
1786        if local_entity:
1787            local_entity.etag = annotation_storage_result.etag
1788            local_entity.annotations = annotation_storage_result
1789        return annotation_storage_result
1790
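        # Async round-trip sketch (hypothetical entity ID): fetch the annotation
        # bundle, then store it back, reusing the two coroutines above:
        #
        #     async def refresh_annotations(store, synapse_id):
        #         bundle = await store.get_async_annotation(synapse_id)
        #         return await store.store_async_annotation(annotation_dict=bundle)
        #
        #     asyncio.run(refresh_annotations(store, "syn111"))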
1791    def process_row_annotations(
1792        self,
1793        dmge: DataModelGraphExplorer,
1794        metadata_syn: Dict[str, Any],
1795        hide_blanks: bool,
1796        csv_list_regex: str,
1797        annos: Dict[str, Any],
1798        annotation_keys: str,
1799    ) -> Dict[str, Any]:
1800        """Processes metadata annotations based on the logic below:
1801        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1802            An empty or whitespace-only string.
1803            A NaN value (if the annotation is a float).
1804        if any of the above conditions are met, and hide_blanks is True, the annotation key is not going to be uploaded and skips further processing of that annotation key.
1805        if any of the above conditions are met, and hide_blanks is False, assigns an empty string "" as the annotation value for that key.
1806
1807        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1808        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1809
1810        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1811
1812        4. Returns the updated annotations dictionary.
1813
1814        Args:
1815            dmge (DataModelGraphExplorer): data model graph explorer
1816            metadata_syn (dict): metadata used for Synapse storage
1817            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1818            csv_list_regex (str): Regex to match with comma separated list
1819            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1820            annotation_keys (str): display_label/class_label
1821
1822        Returns:
1823            Dict[str, Any]: annotations as a dictionary
1824
1825        ```mermaid
1826        flowchart TD
1827            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1828            C -- Yes --> D{Is hide_blanks True?}
1829            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1830            D -- No --> F[Assign empty string to annotation key]
1831            C -- No --> G{Is anno_v a string?}
1832            G -- No --> H[Assign original value of anno_v to annotation key]
1833            G -- Yes --> I{Does anno_v match csv_list_regex?}
1834            I -- Yes --> J[Get validation rule of anno_k]
1835            J --> K{Does the validation rule contain 'list'}
1836            K -- Yes --> L[Split anno_v by commas and assign as list]
1837            I -- No --> H
1838            K -- No --> H
1839        ```
1840        """
1841        for anno_k, anno_v in metadata_syn.items():
1842            # Remove keys with nan or empty string values or string that only contains white space from dict of annotations to be uploaded
1843            # if present on current data annotation
1844            if hide_blanks and (
1845                (isinstance(anno_v, str) and anno_v.strip() == "")
1846                or (isinstance(anno_v, float) and np.isnan(anno_v))
1847            ):
1848                annos["annotations"]["annotations"].pop(anno_k) if anno_k in annos[
1849                    "annotations"
1850                ]["annotations"].keys() else annos["annotations"]["annotations"]
1851                continue
1852
1853            # Otherwise save annotation as appropriate
1854            if isinstance(anno_v, float) and np.isnan(anno_v):
1855                annos["annotations"]["annotations"][anno_k] = ""
1856                continue
1857
1858            # Handle strings that match the csv_list_regex and pass the validation rule
1859            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1860                # Use a dictionary to dynamically choose the argument
1861                param = (
1862                    {"node_display_name": anno_k}
1863                    if annotation_keys == "display_label"
1864                    else {"node_label": anno_k}
1865                )
1866                node_validation_rules = dmge.get_node_validation_rules(**param)
1867
1868                if rule_in_rule_list("list", node_validation_rules):
1869                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1870                    continue
1871            # default: assign the original value
1872            annos["annotations"]["annotations"][anno_k] = anno_v
1873
1874        return annos
1875
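        # Worked example of the rules above (hypothetical inputs):
        #
        #     metadata_syn = {"Age": "", "Tissue": "lung, liver"}
        #
        # With hide_blanks=True, "Age" is dropped from the upload. "Tissue"
        # matches csv_list_regex, so if its validation rule contains "list"
        # it is stored as ["lung", "liver"]; otherwise the raw string is kept.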
1876    @async_missing_entity_handler
1877    async def format_row_annotations(
1878        self,
1879        dmge: DataModelGraphExplorer,
1880        row: pd.Series,
1881        entityId: str,
1882        hideBlanks: bool,
1883        annotation_keys: str,
1884    ) -> Union[None, Dict[str, Any]]:
1885        """Format row annotations
1886
1887        Args:
1888            dmge (DataModelGraphExplorer): data model graph explorer object
1889            row (pd.Series): row of the manifest
1890            entityId (str): entity id of the manifest
1891            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, upload Annotation keys with empty string values
1892            annotation_keys (str): display_label/class_label
1893
1894        Returns:
1895            Union[None, Dict[str, Any]]: if entity id is in trash can, return None. Otherwise, return the annotations
1896        """
1897        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support, e.g. no spaces or parentheses)
1898        # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest
1899        # this could create a divergence between manifest column and annotations. this should be ok for most use cases.
1900        # columns with special characters are outside of the schema
1901        metadataSyn = {}
1902        blacklist_chars = ["(", ")", ".", " ", "-"]
1903
1904        for k, v in row.to_dict().items():
1905            if annotation_keys == "display_label":
1906                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1907            elif annotation_keys == "class_label":
1908                keySyn = get_class_label_from_display_name(str(k)).translate(
1909                    {ord(x): "" for x in blacklist_chars}
1910                )
1911
1912            # Skip `Filename` and `ETag` columns when setting annotations
1913            if keySyn in ["Filename", "ETag", "eTag"]:
1914                continue
1915
1916            # truncate annotation values of 500 or more characters and append
1917            # an explicit [truncatedByDataCuratorApp] marker to indicate that
1918            # the cell value has been truncated, keeping the stored value
1919            # within Synapse's 500-character annotation limit
1921            if isinstance(v, str) and len(v) >= 500:
1922                v = v[0:472] + "[truncatedByDataCuratorApp]"
1923
1924            metadataSyn[keySyn] = v
1925
1926        # This will first check if the entity is already in memory, and if so, that
1927        # instance is used. Unfortunately, the expected return format needs to match
1928        # the Synapse API, so we need to convert the annotations to the expected format.
1929        entity = self.synapse_entity_tracker.get(
1930            synapse_id=entityId,
1931            syn=self.syn,
1932            download_file=False,
1933            retrieve_if_not_present=False,
1934        )
1935        if entity is not None:
1936            synapse_annotations = _convert_to_annotations_list(
1937                annotations=entity.annotations
1938            )
1939            annos = {
1940                "annotations": {
1941                    "id": entity.id,
1942                    "etag": entity.etag,
1943                    "annotations": synapse_annotations,
1944                }
1945            }
1946        else:
1947            annos = await self.get_async_annotation(entityId)
1948
1949        # set annotation(s) for the various objects/items in a dataset on Synapse
1950        csv_list_regex = comma_separated_list_regex()
1951
1952        annos = self.process_row_annotations(
1953            dmge=dmge,
1954            metadata_syn=metadataSyn,
1955            hide_blanks=hideBlanks,
1956            csv_list_regex=csv_list_regex,
1957            annos=annos,
1958            annotation_keys=annotation_keys,
1959        )
1960
1961        return annos
1962
1963    @missing_entity_handler
1964    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
1965    def format_manifest_annotations(self, manifest, manifest_synapse_id):
1966        """
1967        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
1968        For now just getting the Component.
1969        """
1970
1971        entity = self.synapse_entity_tracker.get(
1972            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
1973        )
1974        is_file = entity.concreteType.endswith(".FileEntity")
1975        is_table = entity.concreteType.endswith(".TableEntity")
1976
1977        if is_file:
1978            # Get file metadata
1979            metadata = self.getFileAnnotations(manifest_synapse_id)
1980
1981            # If there is a defined component add it to the metadata.
1982            if "Component" in manifest.columns:
1983                # Gather component information
1984                component = manifest["Component"].unique()
1985
1986                # Double check that only a single component is listed, else raise an error.
1987                if len(component) != 1:
1988                    raise ValueError(
1989                        "Manifest has more than one component. Please check manifest and resubmit."
1990                    )
1993
1994                # Add component to metadata
1995                metadata["Component"] = component[0]
1996
1997        elif is_table:
1998            # Get table metadata
1999            metadata = self.getTableAnnotations(manifest_synapse_id)
2000
2001        # Get annotations
2002        annos = OldAnnotations(
2003            id=entity.id, etag=entity.etag, values=entity.annotations
2004        )
2005
2006        # Add metadata to the annotations
2007        for annos_k, annos_v in metadata.items():
2008            annos[annos_k] = annos_v
2009
2010        return annos
2011
2012    '''
2013    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
2014        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
2015        """
2016        Purpose:
2017            Works very similarly to associateMetadataWithFiles except takes in the manifest
2018            rather than the manifest path
2019
2020        """
2021
2022        # Add uuid for table updates and fill.
2023        if not "Uuid" in manifest.columns:
2024            manifest["Uuid"] = ''
2025
2026        for idx,row in manifest.iterrows():
2027            if not row["Uuid"]:
2028                gen_uuid = uuid.uuid4()
2029                row["Uuid"] = gen_uuid
2030                manifest.loc[idx, 'Uuid'] = gen_uuid
2031
2032        # add entityId as a column if not already there or
2033        # fill any blanks with an empty string.
2034        if not "entityId" in manifest.columns:
2035            manifest["entityId"] = ""
2036        else:
2037            manifest["entityId"].fillna("", inplace=True)
2038
2039        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
2040        dmge = DataModelGraphExplorer()
2041
2042        # Create table name here.
2043        if 'Component' in manifest.columns:
2044            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
2045        else:
2046            table_name = 'synapse_storage_manifest_table'
2047
2048        # Upload manifest as a table and get the SynID and manifest
2049        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
2050                                                    dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
2051
2052        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
2053        # also set metadata for each synapse entity as Synapse annotations
2054        for idx, row in manifest.iterrows():
2055            if not row["entityId"]:
2056                # If not using entityIds, fill with manifest_table_id so
2057                row["entityId"] = manifest_synapse_table_id
2058                entityId = ''
2059            else:
2060                # get the entity id corresponding to this row
2061                entityId = row["entityId"]
2062
2063        # Load manifest to synapse as a CSV File
2064        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
2065
2066        # Get annotations for the file manifest.
2067        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
2068
2069        self.syn.set_annotations(manifest_annotations)
2070
2071        logger.info("Associated manifest file with dataset on Synapse.")
2072
2073        # Update manifest Synapse table with new entity id column.
2074        self.make_synapse_table(
2075            table_to_load = table_manifest,
2076            dataset_id = datasetId,
2077            existingTableId = manifest_synapse_table_id,
2078            table_name = table_name,
2079            update_col = 'Uuid',
2080            specify_schema = False,
2081            )
2082
2083        # Get annotations for the table manifest
2084        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
2085        self.syn.set_annotations(manifest_annotations)
2086        return manifest_synapse_table_id
2087    '''
2088
2089    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
2090        """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing.
2091        Args:
2092            metadataManifestPath (str): path where manifest is stored
2093        Returns:
2094            manifest(pd.DataFrame): Manifest loaded as a pandas dataframe
2095        Raises:
2096            FileNotFoundError: Manifest file does not exist at provided path.
2097        """
2098        # read new manifest csv
2099        try:
2100            load_args = {
2101                "dtype": "string",
2102            }
2103            manifest = load_df(
2104                metadataManifestPath,
2105                preserve_raw_input=False,
2106                allow_na_values=False,
2107                **load_args,
2108            )
2109        except FileNotFoundError as err:
2110            raise FileNotFoundError(
2111                f"No manifest file was found at this path: {metadataManifestPath}"
2112            ) from err
2113        return manifest
2114
2115    def _add_id_columns_to_manifest(
2116        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
2117    ):
2118        """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row.
2119        Args:
2120            Manifest loaded as a pd.Dataframe
2121        Returns (pd.DataFrame):
2122            Manifest df with new Id and EntityId columns (and UUID values) if they were not already present.
2123        """
2124
2125        # Add Id for table updates and fill.
2126        if not col_in_dataframe("Id", manifest):
2127            # See if schema has `Uuid` column specified
2128            try:
2129                uuid_col_in_schema = dmge.is_class_in_schema(
2130                    "Uuid"
2131                ) or dmge.is_class_in_schema("uuid")
2132            except KeyError:
2133                uuid_col_in_schema = False
2134
2135            # Rename `Uuid` column if it wasn't specified in the schema
2136            if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema:
2137                manifest.rename(columns={"Uuid": "Id"}, inplace=True)
2138            # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column
2139            else:
2140                manifest["Id"] = ""
2141
2142        # Retrieve the ID column name (id, Id and ID) are treated the same.
2143        id_col_name = [col for col in manifest.columns if col.lower() == "id"][0]
2144
2145        # Check if values have been added to the Id column; if not, add a UUID so the value in the row is not blank.
2146        for idx, row in manifest.iterrows():
2147            if not row[id_col_name]:
2148                gen_uuid = str(uuid.uuid4())
2149                row[id_col_name] = gen_uuid
2150                manifest.loc[idx, id_col_name] = gen_uuid
2151
2152        # add entityId as a column if not already there or
2153        # fill any blanks with an empty string.
2154        if not col_in_dataframe("entityId", manifest):
2155            manifest["entityId"] = ""
2156        else:
2157            manifest["entityId"].fillna("", inplace=True)
2158
2159        return manifest
2160
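        # Effect sketch (hypothetical frame): a manifest with only a "Filename"
        # column gains "Id" and "entityId" columns, where each blank Id is filled
        # with a fresh str(uuid.uuid4()) and entityId defaults to "".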
2161    def _generate_table_name(self, manifest):
2162        """Helper function to generate a table name for upload to synapse.
2163
2164        Args:
2165            Manifest loaded as a pd.Dataframe
2166
2167        Returns:
2168            table_name (str): Name of the table to load
2169            component_name (str): Name of the manifest component (if applicable)
2170        """
2171        # Create table name here.
2172        if "Component" in manifest.columns:
2173            component_name = manifest["Component"][0].lower()
2174            table_name = component_name + "_synapse_storage_manifest_table"
2175        else:
2176            component_name = ""
2177            table_name = "synapse_storage_manifest_table"
2178        return table_name, component_name
2179
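        # Example: a manifest whose Component column starts with "Patient" yields
        # ("patient_synapse_storage_manifest_table", "patient"); a manifest with
        # no Component column yields ("synapse_storage_manifest_table", "").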
2180    def _create_entity_id(self, idx, row, manifest, datasetId):
2181        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
2182        Args:
                idx: index of the manifest row being processed
2183            row: current row of manifest being processed
2184            manifest (pd.DataFrame): loaded df containing user supplied data.
2185            datasetId (str): synapse ID of folder containing the dataset
2186
2187        Returns:
2188            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
2189            entityId (str): Generated Entity Id.
2190
2191        """
2192        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
2193        rowEntity = self.syn.store(rowEntity)
2194        entityId = rowEntity["id"]
2195        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
2196        row["entityId"] = entityId
2197        manifest.loc[idx, "entityId"] = entityId
2198        return manifest, entityId
2199
2200    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
2201        """Process annotations and store them on synapse asynchronously
2202
2203        Args:
2204            requests (Set[asyncio.Task]): a set of annotation-formatting tasks created by format_row_annotations in the previous step
2205
2206        Raises:
2207            RuntimeError: raise a run time error if a task failed to complete
2208        """
2209        while requests:
2210            done_tasks, pending_tasks = await asyncio.wait(
2211                requests, return_when=asyncio.FIRST_COMPLETED
2212            )
2213            requests = pending_tasks
2214
2215            for completed_task in done_tasks:
2216                try:
2217                    annos = completed_task.result()
2218
2219                    if isinstance(annos, Annotations):
2220                        logger.info(f"Successfully stored annotations for {annos.id}")
2221                    else:
2222                        # store annotations if they are not None
2223                        if annos:
2224                            entity_id = annos["annotations"]["id"]
2225                            logger.info(
2226                                f"Obtained and processed annotations for {entity_id} entity"
2227                            )
2228                            requests.add(
2229                                asyncio.create_task(
2230                                    self.store_async_annotation(annotation_dict=annos)
2231                                )
2232                            )
2233                except Exception as e:
2234                    raise RuntimeError(f"Annotation storage task failed with {repr(e)}.") from e
2235
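        # Pattern sketch: formatting tasks finish first and re-queue themselves as
        # storage tasks, so the single FIRST_COMPLETED wait loop drains both phases:
        #
        #     requests = {
        #         asyncio.create_task(self.format_row_annotations(dmge, row, entity_id, hide_blanks, keys))
        #         for entity_id, row in rows  # hypothetical iterable
        #     }
        #     await self._process_store_annos(requests)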
2236    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2237    async def add_annotations_to_entities_files(
2238        self,
2239        dmge,
2240        manifest,
2241        manifest_record_type: str,
2242        datasetId: str,
2243        hideBlanks: bool,
2244        manifest_synapse_table_id="",
2245        annotation_keys: str = "class_label",
2246    ):
2247        """
2248        Depending on the upload type, fill the entityId column with Ids. Add annotations to connected
2249        files and folders. Despite the name of this function, it also applies to folders.
2250
2251        Args:
2252            dmge: DataModelGraphExplorer Object
2253            manifest (pd.DataFrame): loaded df containing user supplied data.
2254            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2255            datasetId (str): synapse ID of folder containing the dataset
2256            hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2257            manifest_synapse_table_id (str): Default is an empty string ''.
2258            annotation_keys: (str) display_label/class_label (default), determines labeling style for annotation keys. class_label will format the display
2259                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2260                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2261        Returns:
2262            manifest (pd.DataFrame): modified to add entityId as appropriate
2263
2264        """
2265
2266        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2267        if "filename" in [col.lower() for col in manifest.columns]:
2268            # get current list of files and store as dataframe
2269            dataset_files = self.getFilesInStorageDataset(datasetId)
2270            files_and_entityIds = self._get_file_entityIds(
2271                dataset_files=dataset_files, only_new_files=False
2272            )
2273            file_df = pd.DataFrame(files_and_entityIds)
2274
2275            # Merge dataframes to add entityIds
2276            manifest = manifest.merge(
2277                file_df, how="left", on="Filename", suffixes=["_x", None]
2278            ).drop("entityId_x", axis=1)
2279
2280        # Fill `entityId` for each row if missing and annotate entity as appropriate
2281        requests = set()
2282        for idx, row in manifest.iterrows():
2283            if not row["entityId"] and (
2284                manifest_record_type == "file_and_entities"
2285                or manifest_record_type == "table_file_and_entities"
2286            ):
2287                manifest, entityId = self._create_entity_id(
2288                    idx, row, manifest, datasetId
2289                )
2290            elif not row["entityId"] and manifest_record_type == "table_and_file":
2291                # If not using entityIds, fill with manifest_table_id so
2292                row["entityId"] = manifest_synapse_table_id
2293                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2294                entityId = ""
2295                # If the row is the manifest table, do not add annotations
2296            elif row["entityId"] == manifest_synapse_table_id:
2297                entityId = ""
2298            else:
2299                # get the file id of the file to annotate, collected in above step.
2300                entityId = row["entityId"]
2301
2302            # Adding annotations to connected files.
2303            if entityId:
2304                # Format annotations for Synapse
2305                annos_task = asyncio.create_task(
2306                    self.format_row_annotations(
2307                        dmge, row, entityId, hideBlanks, annotation_keys
2308                    )
2309                )
2310                requests.add(annos_task)
2311        await self._process_store_annos(requests)
2312        return manifest
2313
2314    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2315    def upload_manifest_as_table(
2316        self,
2317        dmge: DataModelGraphExplorer,
2318        manifest: pd.DataFrame,
2319        metadataManifestPath: str,
2320        datasetId: str,
2321        table_name: str,
2322        component_name: str,
2323        restrict: bool,
2324        manifest_record_type: str,
2325        hideBlanks: bool,
2326        table_manipulation: str,
2327        table_column_names: str,
2328        annotation_keys: str,
2329        file_annotations_upload: bool = True,
2330    ):
2331        """Upload manifest to Synapse as a table and csv.
2332        Args:
2333            dmge: DataModelGraphExplorer object
2334            manifest (pd.DataFrame): loaded df containing user supplied data.
2335            metadataManifestPath: path to csv containing a validated metadata manifest.
2336            datasetId (str): synapse ID of folder containing the dataset
2337            table_name (str): Generated to name the table being uploaded.
2338            component_name (str): Name of the component manifest that is currently being uploaded.
2339            restrict (bool): Flag for censored data.
2340            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2341            hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2342            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2343            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2344                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2345                display label formatting.
2346            annotation_keys: (str) display_label/class_label (default), sets labeling style for annotation keys. class_label will format the display
2347                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2348                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2349            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2350        Return:
2351            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2352        """
2353        # Upload manifest as a table, get the ID and updated manifest.
2354        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2355            dmge=dmge,
2356            manifest=manifest,
2357            datasetId=datasetId,
2358            table_name=table_name,
2359            restrict=restrict,
2360            table_manipulation=table_manipulation,
2361            table_column_names=table_column_names,
2362        )
2363
2364        if file_annotations_upload:
2365            manifest = asyncio.run(
2366                self.add_annotations_to_entities_files(
2367                    dmge,
2368                    manifest,
2369                    manifest_record_type,
2370                    datasetId,
2371                    hideBlanks,
2372                    manifest_synapse_table_id,
2373                    annotation_keys,
2374                )
2375            )
2376        # Load manifest to synapse as a CSV File
2377        manifest_synapse_file_id = self.upload_manifest_file(
2378            manifest=manifest,
2379            metadataManifestPath=metadataManifestPath,
2380            datasetId=datasetId,
2381            restrict_manifest=restrict,
2382            component_name=component_name,
2383        )
2384
2385        # Set annotations for the file manifest.
2386        manifest_annotations = self.format_manifest_annotations(
2387            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2388        )
2389        annos = self.syn.set_annotations(annotations=manifest_annotations)
2390        manifest_entity = self.synapse_entity_tracker.get(
2391            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2392        )
2393        manifest_entity.annotations = annos
2394        manifest_entity.etag = annos.etag
2395
2396        logger.info("Associated manifest file with dataset on Synapse.")
2397
2398        # Update manifest Synapse table with new entity id column.
2399        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2400            dmge=dmge,
2401            manifest=manifest,
2402            datasetId=datasetId,
2403            table_name=table_name,
2404            restrict=restrict,
2405            table_manipulation="update",
2406            table_column_names=table_column_names,
2407        )
2408
2409        # Set annotations for the table manifest
2410        manifest_annotations = self.format_manifest_annotations(
2411            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2412        )
2413        annotations_manifest_table = self.syn.set_annotations(
2414            annotations=manifest_annotations
2415        )
2416        manifest_table_entity = self.synapse_entity_tracker.get(
2417            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2418        )
2419        manifest_table_entity.annotations = annotations_manifest_table
2420        manifest_table_entity.etag = annotations_manifest_table.etag
2421
2422        return manifest_synapse_file_id
2423
2424    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2425    def upload_manifest_as_csv(
2426        self,
2427        dmge,
2428        manifest,
2429        metadataManifestPath,
2430        datasetId,
2431        restrict,
2432        manifest_record_type,
2433        hideBlanks,
2434        component_name,
2435        annotation_keys: str,
2436        file_annotations_upload: bool = True,
2437    ):
2438        """Upload manifest to Synapse as a csv only.
2439        Args:
2440            dmge: DataModelGraphExplorer object
2441            manifest (pd.DataFrame): loaded df containing user supplied data.
2442            metadataManifestPath: path to csv containing a validated metadata manifest.
2443            datasetId (str): synapse ID of folder containing the dataset
2444            restrict (bool): Flag for censored data.
2445            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2446            hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
2447            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display
2448                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2449                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2450            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2451        Return:
2452            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2453        """
2454        if file_annotations_upload:
2455            manifest = asyncio.run(
2456                self.add_annotations_to_entities_files(
2457                    dmge,
2458                    manifest,
2459                    manifest_record_type,
2460                    datasetId,
2461                    hideBlanks,
2462                    annotation_keys=annotation_keys,
2463                )
2464            )
2465
2466        # Load manifest to synapse as a CSV File
2467        manifest_synapse_file_id = self.upload_manifest_file(
2468            manifest,
2469            metadataManifestPath,
2470            datasetId,
2471            restrict,
2472            component_name=component_name,
2473        )
2474
2475        # Set annotations for the file manifest.
2476        manifest_annotations = self.format_manifest_annotations(
2477            manifest, manifest_synapse_file_id
2478        )
2479        annos = self.syn.set_annotations(manifest_annotations)
2480        manifest_entity = self.synapse_entity_tracker.get(
2481            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2482        )
2483        manifest_entity.annotations = annos
2484        manifest_entity.etag = annos.etag
2485
2486        logger.info("Associated manifest file with dataset on Synapse.")
2487
2488        return manifest_synapse_file_id
2489
2490    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2491    def upload_manifest_combo(
2492        self,
2493        dmge,
2494        manifest,
2495        metadataManifestPath,
2496        datasetId,
2497        table_name,
2498        component_name,
2499        restrict,
2500        manifest_record_type,
2501        hideBlanks,
2502        table_manipulation,
2503        table_column_names: str,
2504        annotation_keys: str,
2505        file_annotations_upload: bool = True,
2506    ):
2507        """Upload manifest to Synapse as a table and CSV with entities.
2508        Args:
2509            dmge: DataModelGraphExplorer object
2510            manifest (pd.DataFrame): loaded df containing user supplied data.
2511            metadataManifestPath: path to csv containing a validated metadata manifest.
2512            datasetId (str): synapse ID of folder containing the dataset
2513            table_name (str): Generated to name the table being uploaded.
2514            component_name (str): Name of the component manifest that is currently being uploaded.
2515            restrict (bool): Flag for censored data.
2516            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2517            hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
2518            table_manipulation (str): Specifies how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2519            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name; class_label formats the display
2520                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2521                display label formatting.
2522            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display
2523                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2524                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2525            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2526        Return:
2527            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2528        """
2529        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2530            dmge=dmge,
2531            manifest=manifest,
2532            datasetId=datasetId,
2533            table_name=table_name,
2534            restrict=restrict,
2535            table_manipulation=table_manipulation,
2536            table_column_names=table_column_names,
2537        )
2538
2539        if file_annotations_upload:
2540            manifest = asyncio.run(
2541                self.add_annotations_to_entities_files(
2542                    dmge,
2543                    manifest,
2544                    manifest_record_type,
2545                    datasetId,
2546                    hideBlanks,
2547                    manifest_synapse_table_id,
2548                    annotation_keys=annotation_keys,
2549                )
2550            )
2551
2552        # Load manifest to synapse as a CSV File
2553        manifest_synapse_file_id = self.upload_manifest_file(
2554            manifest, metadataManifestPath, datasetId, restrict, component_name
2555        )
2556
2557        # Set annotations for the file manifest.
2558        manifest_annotations = self.format_manifest_annotations(
2559            manifest, manifest_synapse_file_id
2560        )
2561        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2562        manifest_entity = self.synapse_entity_tracker.get(
2563            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2564        )
2565        manifest_entity.annotations = file_manifest_annotations
2566        manifest_entity.etag = file_manifest_annotations.etag
2567        logger.info("Associated manifest file with dataset on Synapse.")
2568
2569        # Update manifest Synapse table with new entity id column.
2570        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2571            dmge=dmge,
2572            manifest=manifest,
2573            datasetId=datasetId,
2574            table_name=table_name,
2575            restrict=restrict,
2576            table_manipulation="update",
2577            table_column_names=table_column_names,
2578        )
2579
2580        # Set annotations for the table manifest
2581        manifest_annotations = self.format_manifest_annotations(
2582            manifest, manifest_synapse_table_id
2583        )
2584        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2585        manifest_entity = self.synapse_entity_tracker.get(
2586            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2587        )
2588        manifest_entity.annotations = table_manifest_annotations
2589        manifest_entity.etag = table_manifest_annotations.etag
2590        return manifest_synapse_file_id
2591
2592    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2593    def associateMetadataWithFiles(
2594        self,
2595        dmge: DataModelGraphExplorer,
2596        metadataManifestPath: str,
2597        datasetId: str,
2598        manifest_record_type: str = "table_file_and_entities",
2599        hideBlanks: bool = False,
2600        restrict_manifest: bool = False,
2601        table_manipulation: str = "replace",
2602        table_column_names: str = "class_label",
2603        annotation_keys: str = "class_label",
2604        file_annotations_upload: bool = True,
2605    ) -> str:
2606        """Associate metadata with files in a storage dataset already on Synapse.
2607        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2608
2609        If this is a new manifest, there may be no Synapse entities associated with the rows of this manifest;
2610        this may be due to the data type (e.g. clinical data) being tabular
2611        and not requiring files; to utilize uniform interfaces downstream
2612        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2613        and an entity column is added to the manifest containing the resulting
2614        entity IDs; a table is also created at present as an additional interface
2615        for downstream query and interaction with the data.
2616
2617        Args:
2618            dmge: DataModelGraphExplorer Object
2619            metadataManifestPath: path to csv containing a validated metadata manifest.
2620                The manifest should include a column entityId containing Synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2621                Some datasets, e.g. clinical data, do not contain file IDs, but data is stored in a table: one row per item.
2622                In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
2623            datasetId: synapse ID of folder containing the dataset
2624            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table_and_file' options in combination.
2625            hideBlanks: Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
2626            restrict_manifest (bool): Default is False. Flag for censored data.
2627            table_manipulation (str): Default is 'replace'. Specifies how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2628            table_column_names (str): display_name/display_label/class_label (default). Sets the labeling style for table column names. display_name uses the raw display name as the column name. class_label formats the display
2629                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2630                display label formatting.
2631            annotation_keys (str): display_label/class_label (default). Sets the labeling style for annotation keys. class_label formats the display
2632                name as upper camelcase and strips blacklisted characters; display_label strips blacklisted characters, including spaces, to retain
2633                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2634        Returns:
2635            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2636        """
2637        # Read new manifest CSV:
2638        manifest = self._read_manifest(metadataManifestPath)
2639        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2640
2641        table_name, component_name = self._generate_table_name(manifest)
2642
2643        # Upload manifest to synapse based on user input (manifest_record_type)
2644        if manifest_record_type == "file_only":
2645            manifest_synapse_file_id = self.upload_manifest_as_csv(
2646                dmge=dmge,
2647                manifest=manifest,
2648                metadataManifestPath=metadataManifestPath,
2649                datasetId=datasetId,
2650                restrict=restrict_manifest,
2651                hideBlanks=hideBlanks,
2652                manifest_record_type=manifest_record_type,
2653                component_name=component_name,
2654                annotation_keys=annotation_keys,
2655                file_annotations_upload=file_annotations_upload,
2656            )
2657        elif manifest_record_type == "table_and_file":
2658            manifest_synapse_file_id = self.upload_manifest_as_table(
2659                dmge=dmge,
2660                manifest=manifest,
2661                metadataManifestPath=metadataManifestPath,
2662                datasetId=datasetId,
2663                table_name=table_name,
2664                component_name=component_name,
2665                restrict=restrict_manifest,
2666                hideBlanks=hideBlanks,
2667                manifest_record_type=manifest_record_type,
2668                table_manipulation=table_manipulation,
2669                table_column_names=table_column_names,
2670                annotation_keys=annotation_keys,
2671                file_annotations_upload=file_annotations_upload,
2672            )
2673        elif manifest_record_type == "file_and_entities":
2674            manifest_synapse_file_id = self.upload_manifest_as_csv(
2675                dmge=dmge,
2676                manifest=manifest,
2677                metadataManifestPath=metadataManifestPath,
2678                datasetId=datasetId,
2679                restrict=restrict_manifest,
2680                hideBlanks=hideBlanks,
2681                manifest_record_type=manifest_record_type,
2682                component_name=component_name,
2683                annotation_keys=annotation_keys,
2684                file_annotations_upload=file_annotations_upload,
2685            )
2686        elif manifest_record_type == "table_file_and_entities":
2687            manifest_synapse_file_id = self.upload_manifest_combo(
2688                dmge=dmge,
2689                manifest=manifest,
2690                metadataManifestPath=metadataManifestPath,
2691                datasetId=datasetId,
2692                table_name=table_name,
2693                component_name=component_name,
2694                restrict=restrict_manifest,
2695                hideBlanks=hideBlanks,
2696                manifest_record_type=manifest_record_type,
2697                table_manipulation=table_manipulation,
2698                table_column_names=table_column_names,
2699                annotation_keys=annotation_keys,
2700                file_annotations_upload=file_annotations_upload,
2701            )
2702        else:
2703            raise ValueError(f"Invalid manifest_record_type: '{manifest_record_type}'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' and 'table_file_and_entities'.")
2704        return manifest_synapse_file_id
2705
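A minimal usage sketch of `associateMetadataWithFiles` follows (not part of the source; the Synapse ID, manifest path, and object construction are illustrative placeholders assuming a configured asset view and valid credentials):

    # Hypothetical example -- IDs, paths, and constructor arguments are placeholders.
    store = SynapseStorage()
    dmge = DataModelGraphExplorer(graph)  # graph parsed from a data model

    manifest_synapse_file_id = store.associateMetadataWithFiles(
        dmge=dmge,
        metadataManifestPath="output/Biospecimen_manifest.csv",
        datasetId="syn12345678",
        manifest_record_type="table_and_file",  # manifest stored as csv + Synapse table
        hideBlanks=True,
    )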
2706    def getTableAnnotations(self, table_id: str):
2707        """Generate dictionary of annotations for the given Synapse file.
2708        Synapse returns all custom annotations as lists since they
2709        can contain multiple values. In all cases, the values will
2710        be converted into strings and concatenated with ", ".
2711
2712        Args:
2713            table_id (str): Synapse ID for a table.
2714
2715        Returns:
2716            dict: Annotations as comma-separated strings.
2717        """
2718        try:
2719            entity = self.synapse_entity_tracker.get(
2720                synapse_id=table_id, syn=self.syn, download_file=False
2721            )
2722            is_table = entity.concreteType.endswith(".TableEntity")
2723            annotations_raw = entity.annotations
2724        except SynapseHTTPError:
2725            # If an error occurs with retrieving entity, skip it
2726            # This could be caused by a temporary file view that
2727            # was deleted since its ID was retrieved
2728            is_table = False
2729
2730        # Skip anything that isn't a table
2731        if not is_table:
2732            return None
2733
2734        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2735
2736        return annotations
2737
2738    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2739        """Generate dictionary of annotations for the given Synapse file.
2740        Synapse returns all custom annotations as lists since they
2741        can contain multiple values. In all cases, the values will
2742        be converted into strings and concatenated with ", ".
2743
2744        Args:
2745            fileId (str): Synapse ID for dataset file.
2746
2747        Returns:
2748            dict: Annotations as comma-separated strings.
2749        """
2750
2751        # Get entity metadata, including annotations
2752        try:
2753            entity = self.synapse_entity_tracker.get(
2754                synapse_id=fileId, syn=self.syn, download_file=False
2755            )
2756            is_file = entity.concreteType.endswith(".FileEntity")
2757            is_folder = entity.concreteType.endswith(".Folder")
2758            annotations_raw = entity.annotations
2759        except SynapseHTTPError:
2760            # If an error occurs with retrieving entity, skip it
2761            # This could be caused by a temporary file view that
2762            # was deleted since its ID was retrieved
2763            is_file, is_folder = False, False
2764
2765        # Skip anything that isn't a file or folder
2766        if not (is_file or is_folder):
2767            return None
2768
2769        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2770
2771        return annotations
2772
2773    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2774        # Extract annotations from their lists and stringify. For example:
2775        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2776        annotations = dict()
2777        for key, vals in annotations_raw.items():
2778            if isinstance(vals, list) and len(vals) == 1:
2779                annotations[key] = str(vals[0])
2780            else:
2781                annotations[key] = ", ".join(str(v) for v in vals)
2782
2783        # Add the file entity ID and eTag, which weren't lists
2784        assert fileId == entity.id, (
2785            "For some reason, the Synapse ID in the response doesn't match"
2786            "the Synapse ID sent in the request (via synapseclient)."
2787        )
2788        annotations["entityId"] = fileId
2789        annotations["eTag"] = entity.etag
2790
2791        return annotations
2792
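As a concrete illustration of the stringification in `getEntityAnnotations`, single-element lists collapse to a bare string while multi-element lists are comma-joined (a standalone sketch of the same logic, not a call into the class):

    annotations_raw = {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}
    annotations = {
        key: str(vals[0]) if isinstance(vals, list) and len(vals) == 1
        else ", ".join(str(v) for v in vals)
        for key, vals in annotations_raw.items()
    }
    # -> {'YearofBirth': '1980', 'author': 'bruno, milen, sujay'}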
2793    def getDatasetAnnotations(
2794        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2795    ) -> pd.DataFrame:
2796        """Generate table for annotations across all files in given dataset.
2797
2798        Args:
2799            datasetId (str): Synapse ID for dataset folder.
2800            fill_na (bool): Whether to replace missing values with
2801                blank strings.
2802            force_batch (bool): Whether to force the function to use
2803                the batch mode, which uses a file view to retrieve
2804                annotations for a given dataset. Default to False
2805                unless there are more than 50 files in the dataset.
2806
2807        Returns:
2808            pd.DataFrame: Table of annotations.
2809        """
2810        # Get all files in given dataset
2811        dataset_files = self.getFilesInStorageDataset(datasetId)
2812
2813        # if there are no dataset files, there are no annotations;
2814        # return an empty DataFrame
2815        if not dataset_files:
2816            return pd.DataFrame()
2817
2818        dataset_files_map = dict(dataset_files)
2819        dataset_file_ids, _ = list(zip(*dataset_files))
2820
2821        # Get annotations for each file from Step 1
2822        # Batch mode
2823        try_batch = len(dataset_files) >= 50 or force_batch
2824        if try_batch:
2825            try:
2826                logger.info("Trying batch mode for retrieving Synapse annotations")
2827                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2828            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2829                logger.info(
2830                    f"Unable to create a temporary file view bound to {datasetId}. "
2831                    "Defaulting to slower iterative retrieval of annotations."
2832                )
2833                # Default to the slower non-batch method
2834                logger.info("Batch mode failed (probably due to permission error)")
2835                try_batch = False
2836
2837        # Non-batch mode
2838        if not try_batch:
2839            logger.info("Using slower (non-batch) sequential mode")
2840            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2841            # Remove any annotations for non-file/folders (stored as None)
2842            records = filter(None, records)
2843            table = pd.DataFrame.from_records(records)
2844
2845        # Add filenames for the files that "survived" annotation retrieval
2846        filenames = [dataset_files_map[i] for i in table["entityId"]]
2847
2848        if "Filename" not in table.columns:
2849            table.insert(0, "Filename", filenames)
2850
2851        # Ensure that entityId and eTag are at the end
2852        entity_ids = table.pop("entityId")
2853        etags = table.pop("eTag")
2854        table.insert(len(table.columns), "entityId", entity_ids)
2855        table.insert(len(table.columns), "eTag", etags)
2856
2857        # Missing values are filled in with empty strings for Google Sheets
2858        if fill_na:
2859            table.fillna("", inplace=True)
2860
2861        # Force all values as strings
2862        return table.astype(str)
2863
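For example (the dataset ID is a placeholder and `store` is assumed to be a `SynapseStorage` instance), forcing batch mode bypasses the 50-file threshold and retrieves annotations through a temporary file view:

    annotations_table = store.getDatasetAnnotations(
        datasetId="syn12345678",
        fill_na=True,       # blank strings instead of NaN, e.g. for Google Sheets
        force_batch=True,   # use a temporary file view even for small datasets
    )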
2864    def raise_final_error(retry_state):
2865        return retry_state.outcome.result()
2866
2867    def checkIfinAssetView(self, syn_id) -> bool:
2868        # get data in administrative fileview for this pipeline
2869        assetViewTable = self.getStorageFileviewTable()
2870        all_files = list(assetViewTable["id"])
2871        if syn_id in all_files:
2872            return True
2873        else:
2874            return False
2875
2876    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2877    @retry(
2878        stop=stop_after_attempt(5),
2879        wait=wait_chain(
2880            *[wait_fixed(10) for i in range(2)]
2881            + [wait_fixed(15) for i in range(2)]
2882            + [wait_fixed(20)]
2883        ),
2884        retry=retry_if_exception_type(LookupError),
2885        retry_error_callback=raise_final_error,
2886    )
2887    def getDatasetProject(self, datasetId: str) -> str:
2888        """Get parent project for a given dataset ID.
2889
2890        Args:
2891            datasetId (str): Synapse entity ID (folder or project).
2892
2893        Raises:
2894            ValueError: Raised if Synapse ID cannot be retrieved
2895            by the user or if it doesn't appear in the file view.
2896
2897        Returns:
2898            str: The Synapse ID for the parent project.
2899        """
2900
2901        # Subset main file view
2902        dataset_index = self.storageFileviewTable["id"] == datasetId
2903        dataset_row = self.storageFileviewTable[dataset_index]
2904
2905        # re-query if no datasets found
2906        if dataset_row.empty:
2907            sleep(5)
2908            self.query_fileview(force_requery=True)
2909            # Subset main file view
2910            dataset_index = self.storageFileviewTable["id"] == datasetId
2911            dataset_row = self.storageFileviewTable[dataset_index]
2912
2913        # Return `projectId` for given row if only one found
2914        if len(dataset_row) == 1:
2915            dataset_project = dataset_row["projectId"].values[0]
2916            return dataset_project
2917
2918        # Otherwise, check if already project itself
2919        try:
2920            syn_object = self.synapse_entity_tracker.get(
2921                synapse_id=datasetId, syn=self.syn, download_file=False
2922            )
2923            if syn_object.properties["concreteType"].endswith("Project"):
2924                return datasetId
2925        except SynapseHTTPError:
2926            raise PermissionError(
2927                f"The given dataset ({datasetId}) isn't accessible with this "
2928                "user. This might be caused by a typo in the dataset Synapse ID."
2929            )
2930
2931        # If not, then assume dataset not in file view
2932        raise LookupError(
2933            f"The given dataset ({datasetId}) doesn't appear in the "
2934            f"configured file view ({self.storageFileview}). This might "
2935            "mean that the file view's scope needs to be updated."
2936        )
2937
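A note on the retry schedule above: with `stop_after_attempt(5)` there are at most four sleeps between attempts, so the effective delays are 10, 10, 15, and 15 seconds; the trailing `wait_fixed(20)` entry would only apply before a sixth attempt that never happens. A standalone sketch of the same tenacity pattern (the decorated function is illustrative):

    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_chain, wait_fixed

    @retry(
        stop=stop_after_attempt(5),
        wait=wait_chain(*[wait_fixed(10)] * 2 + [wait_fixed(15)] * 2 + [wait_fixed(20)]),
        retry=retry_if_exception_type(LookupError),
    )
    def lookup_with_backoff():
        # Replace with a real lookup; raising LookupError triggers a retry.
        raise LookupError("not found yet")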
2938    def getDatasetAnnotationsBatch(
2939        self, datasetId: str, dataset_file_ids: Optional[Sequence[str]] = None
2940    ) -> pd.DataFrame:
2941        """Generate table for annotations across all files in given dataset.
2942        This function uses a temporary file view to generate a table
2943        instead of iteratively querying for individual entity annotations.
2944        This function is expected to run much faster than the
2945        iterative per-entity retrieval in `self.getDatasetAnnotations` on large datasets.
2946
2947        Args:
2948            datasetId (str): Synapse ID for dataset folder.
2949            dataset_file_ids (Sequence[str]): List of Synapse IDs
2950                for dataset files/folders used to subset the table.
2951
2952        Returns:
2953            pd.DataFrame: Table of annotations.
2954        """
2955        # Create data frame from annotations file view
2956        with DatasetFileView(datasetId, self.syn) as fileview:
2957            table = fileview.query()
2958
2959        if dataset_file_ids:
2960            table = table.loc[table.index.intersection(dataset_file_ids)]
2961
2962        table = table.reset_index(drop=True)
2963
2964        return table
2965
2966    def _get_table_schema_by_cname(self, table_schema):
2967        # assume no duplicate column names in the table
2968        table_schema_by_cname = {}
2969
2970        for col_record in table_schema:
2971            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
2972            table_schema_by_cname[col_record["name"]] = col_record
2973
2974        return table_schema_by_cname
2975
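For instance, given column records of the shape returned for a Synapse table schema, the helper above keys them by column name (field values are illustrative):

    table_schema = [
        {"name": "PatientID", "columnType": "STRING", "maximumSize": 64},
        {"name": "YearofBirth", "columnType": "INTEGER"},
    ]
    # _get_table_schema_by_cname(table_schema) returns:
    # {
    #     "PatientID": {"name": "PatientID", "columnType": "STRING", "maximumSize": 64},
    #     "YearofBirth": {"name": "YearofBirth", "columnType": "INTEGER"},
    # }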
2976
2977class TableOperations:
2978    """
2979    Object to hold functions for various table operations specific to the Synapse Asset Store.
2980
2981    Currently implemented operations are:
2982    createTable: upload a manifest as a new table when none exists
2983    replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
2984    updateTable: add a column to a table that already exists on Synapse
2985
2986    Operations currently in development are:
2987    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
2988    """
2989
2990    def __init__(
2991        self,
2992        synStore: SynapseStorage,
2993        tableToLoad: pd.DataFrame = None,
2994        tableName: str = None,
2995        datasetId: str = None,
2996        existingTableId: str = None,
2997        restrict: bool = False,
2998        synapse_entity_tracker: SynapseEntityTracker = None,
2999    ):
3000        """
3001        Class governing table operations (creation, replacement, upserts, updates) in schematic
3002
3003        tableToLoad: manifest formatted appropriately for the table
3004        tableName: name of the table to be uploaded
3005        datasetId: synID of the dataset for the manifest
3006        existingTableId: synId of the table currently existing on synapse (if there is one)
3007        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3008        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3009
3010        """
3011        self.synStore = synStore
3012        self.tableToLoad = tableToLoad
3013        self.tableName = tableName
3014        self.datasetId = datasetId
3015        self.existingTableId = existingTableId
3016        self.restrict = restrict
3017        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3018
3019    @tracer.start_as_current_span("TableOperations::createTable")
3020    def createTable(
3021        self,
3022        columnTypeDict: dict = None,
3023        specifySchema: bool = True,
3024    ):
3025        """
3026        Method to create a table from a metadata manifest and upload it to synapse
3027
3028        Args:
3029            columnTypeDict: dictionary schema for table columns: type, size, etc
3030            specifySchema: to specify a specific schema for the table format
3031
3032        Returns:
3033            table.schema.id: synID of the newly created table
3034        """
3035        datasetEntity = self.synapse_entity_tracker.get(
3036            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3037        )
3038        datasetName = datasetEntity.name
3039        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3040
3041        if not self.tableName:
3042            self.tableName = datasetName + "table"
3043        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3044        if specifySchema:
3045            if columnTypeDict == {}:
3046                logger.error("Did not provide a columnTypeDict.")
3047            # create list of columns:
3048            cols = []
3049            for col in self.tableToLoad.columns:
3050                if col in table_schema_by_cname:
3051                    col_type = table_schema_by_cname[col]["columnType"]
3052                    max_size = (
3053                        table_schema_by_cname[col]["maximumSize"]
3054                        if "maximumSize" in table_schema_by_cname[col].keys()
3055                        else 100
3056                    )
3057                    max_list_len = 250
3058                    if max_size and max_list_len:
3059                        cols.append(
3060                            Column(
3061                                name=col,
3062                                columnType=col_type,
3063                                maximumSize=max_size,
3064                                maximumListLength=max_list_len,
3065                            )
3066                        )
3067                    elif max_size:
3068                        cols.append(
3069                            Column(name=col, columnType=col_type, maximumSize=max_size)
3070                        )
3071                    else:
3072                        cols.append(Column(name=col, columnType=col_type))
3073                else:
3074                    # TODO add warning that the given col was not found and its max size is set to 100
3075                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3076            schema = Schema(
3077                name=self.tableName, columns=cols, parent=datasetParentProject
3078            )
3079            table = Table(schema, self.tableToLoad)
3080            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3081            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3082            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3083            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3084            return table.schema.id
3085        else:
3086            # For just uploading the tables to synapse using default
3087            # column types.
3088            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3089            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3090            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3091            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3092            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3093            return table.schema.id
3094
3095    @tracer.start_as_current_span("TableOperations::replaceTable")
3096    def replaceTable(
3097        self,
3098        specifySchema: bool = True,
3099        columnTypeDict: dict = None,
3100    ):
3101        """
3102        Method to replace an existing table on synapse with metadata from a new manifest
3103
3104        Args:
3105            specifySchema: to specify a specific schema for the table format
3106            columnTypeDict: dictionary schema for table columns: type, size, etc
3107
3108        Returns:
3109           existingTableId: synID of the already existing table that had its metadata replaced
3110        """
3111        datasetEntity = self.synapse_entity_tracker.get(
3112            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3113        )
3114
3115        datasetName = datasetEntity.name
3116        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3117        existing_table, existing_results = self.synStore.get_synapse_table(
3118            self.existingTableId
3119        )
3120        # remove rows
3121        self.synStore.syn.delete(existing_results)
3122        # Data changes such as removing all rows causes the eTag to change.
3123        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3124        # wait for row deletion to finish on synapse before getting empty table
3125        sleep(10)
3126
3127        # removes all current columns
3128        current_table = self.synapse_entity_tracker.get(
3129            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3130        )
3131
3132        current_columns = self.synStore.syn.getTableColumns(current_table)
3133        for col in current_columns:
3134            current_table.removeColumn(col)
3135
3136        if not self.tableName:
3137            self.tableName = datasetName + "table"
3138
3139        # Process columns according to manifest entries
3140        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3141        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3142        if specifySchema:
3143            if columnTypeDict == {}:
3144                logger.error("Did not provide a columnTypeDict.")
3145            # create list of columns:
3146            cols = []
3147
3148            for col in self.tableToLoad.columns:
3149                if col in table_schema_by_cname:
3150                    col_type = table_schema_by_cname[col]["columnType"]
3151                    max_size = (
3152                        table_schema_by_cname[col]["maximumSize"]
3153                        if "maximumSize" in table_schema_by_cname[col].keys()
3154                        else 100
3155                    )
3156                    max_list_len = 250
3157                    if max_size and max_list_len:
3158                        cols.append(
3159                            Column(
3160                                name=col,
3161                                columnType=col_type,
3162                                maximumSize=max_size,
3163                                maximumListLength=max_list_len,
3164                            )
3165                        )
3166                    elif max_size:
3167                        cols.append(
3168                            Column(name=col, columnType=col_type, maximumSize=max_size)
3169                        )
3170                    else:
3171                        cols.append(Column(name=col, columnType=col_type))
3172                else:
3173                    # TODO add warning that the given col was not found and its max size is set to 100
3174                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3175
3176            # adds new columns to schema
3177            for col in cols:
3178                current_table.addColumn(col)
3179            table_result = self.synStore.syn.store(
3180                current_table, isRestricted=self.restrict
3181            )
3182            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3183            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3184            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3185
3186            # wait for synapse store to finish
3187            sleep(1)
3188
3189            # build schema and table from columns and store with necessary restrictions
3190            schema = Schema(
3191                name=self.tableName, columns=cols, parent=datasetParentProject
3192            )
3193            schema.id = self.existingTableId
3194            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3195            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3196            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3197            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3198            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3199        else:
3200            logger.error("Must specify a schema for table replacements")
3201
3202        # remove system metadata from manifest
3203        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3204        return self.existingTableId
3205
3206    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3207    def _get_auth_token(
3208        self,
3209    ):
3210        authtoken = None
3211
3212        # Get access token from environment variable if available
3213        # Primarily useful for testing environments, with other possible usefulness for containers
3214        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3215        if env_access_token:
3216            authtoken = env_access_token
3217            return authtoken
3218
3219        # Get token from authorization header
3220        # Primarily useful for API endpoint functionality
3221        if "Authorization" in self.synStore.syn.default_headers:
3222            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3223                "Bearer "
3224            )[-1]
3225            return authtoken
3226
3227        # Retrieve credentials from the synapse object
3228        # Primarily useful for local users; could only be stored here when a .synapseConfig file is used, but including to be safe
3229        synapse_object_creds = self.synStore.syn.credentials
3230        if hasattr(synapse_object_creds, "_token"):
3231            authtoken = synapse_object_creds.secret
3232
3233        # Try getting creds from .synapseConfig file if it exists
3234        # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
3235        if os.path.exists(CONFIG.synapse_configuration_path):
3236            config = get_config_file(CONFIG.synapse_configuration_path)
3237
3238            # check which credentials are provided in file
3239            if config.has_option("authentication", "authtoken"):
3240                authtoken = config.get("authentication", "authtoken")
3241
3242        # raise error if required credentials are not found
3243        if not authtoken:
3244            raise NameError(
3245                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3246            )
3247
3248        return authtoken
3249
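The credential resolution above is ordered: the environment variable is checked first, then the Authorization header; failing those, the Synapse credentials object is consulted, with a `.synapseConfig` authtoken taking precedence over it when present. Setting the environment variable therefore short-circuits all later lookups (the token value below is a placeholder):

    import os

    os.environ["SYNAPSE_ACCESS_TOKEN"] = "<personal-access-token>"
    # _get_auth_token() now returns this value without consulting the
    # Synapse credentials object or the .synapseConfig file.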
3250    @tracer.start_as_current_span("TableOperations::upsertTable")
3251    def upsertTable(self, dmge: DataModelGraphExplorer):
3252        """
3253        Method to upsert rows from a new manifest into an existing table on synapse
3254        For upsert functionality to work, primary keys must follow the naming convention of <component>_id
3255        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3256        Currently it is required to use -dl/--use_display_label with table upserts.
3257
3258
3259        Args:
3260            dmge: DataModelGraphExplorer instance
3261
3262        Returns:
3263           existingTableId: synID of the already existing table that had new rows upserted into it
3264        """
3265
3266        authtoken = self._get_auth_token()
3267
3268        synapseDB = SynapseDatabase(
3269            auth_token=authtoken,
3270            project_id=self.synStore.getDatasetProject(self.datasetId),
3271            syn=self.synStore.syn,
3272            synapse_entity_tracker=self.synapse_entity_tracker,
3273        )
3274
3275        try:
3276            # Try performing upsert
3277            synapseDB.upsert_table_rows(
3278                table_name=self.tableName, data=self.tableToLoad
3279            )
3280        except SynapseHTTPError as ex:
3281            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3282            if "Id is not a valid column name or id" in str(ex):
3283                self._update_table_uuid_column(dmge)
3284                synapseDB.upsert_table_rows(
3285                    table_name=self.tableName, data=self.tableToLoad
3286                )
3287            # Raise if other error
3288            else:
3289                raise ex
3290
3291        return self.existingTableId
3292
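To illustrate the `<component>_id` primary-key convention required above: for a hypothetical `Biospecimen` component, the manifest passed as `tableToLoad` must carry a `Biospecimen_id` column uniquely keying each row (all column names and values below are made up):

    import pandas as pd

    table_to_load = pd.DataFrame(
        {
            "Biospecimen_id": ["bs_0001", "bs_0002"],  # <component>_id primary key
            "PatientID": ["p_01", "p_01"],
            "TissueStatus": ["Healthy", "Malignant"],
        }
    )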
3293    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3294    def _update_table_uuid_column(
3295        self,
3296        dmge: DataModelGraphExplorer,
3297    ) -> None:
3298        """Removes the `Uuid` column when present, and relpaces with an `Id` column
3299        Used to enable backwards compatability for manifests using the old `Uuid` convention
3300
3301        Args:
3302            dmge: DataModelGraphExplorer instance
3303
3304        Returns:
3305            None
3306        """
3307
3308        # Get the columns of the schema
3309        schema = self.synapse_entity_tracker.get(
3310            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3311        )
3312
3313        cols = self.synStore.syn.getTableColumns(schema)
3314
3315        # Iterate through columns until `Uuid` column is found
3316        for col in cols:
3317            if col.name.lower() == "uuid":
3318                # See if schema has `Uuid` column specified
3319                try:
3320                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3321                except KeyError:
3322                    uuid_col_in_schema = False
3323
3324                # If there is, then create a new `Id` column from scratch
3325                if uuid_col_in_schema:
3326                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3327                    schema.addColumn(new_col)
3328                    schema = self.synStore.syn.store(schema)
3329                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3330                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3331                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3332                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3333                else:
3334                    # Build ColumnModel that will be used for new column
3335                    id_column = Column(
3336                        name="Id",
3337                        columnType="STRING",
3338                        maximumSize=64,
3339                        defaultValue=None,
3340                        maximumListLength=1,
3341                    )
3342                    new_col_response = self.synStore.syn.store(id_column)
3343
3344                    # Define columnChange body
3345                    columnChangeDict = {
3346                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3347                        "entityId": self.existingTableId,
3348                        "changes": [
3349                            {
3350                                "oldColumnId": col["id"],
3351                                "newColumnId": new_col_response["id"],
3352                            }
3353                        ],
3354                    }
3355
3356                    self.synStore.syn._async_table_update(
3357                        table=self.existingTableId,
3358                        changes=[columnChangeDict],
3359                        wait=False,
3360                    )
3361                break
3362
3363        return
3364
3365    @tracer.start_as_current_span("TableOperations::updateTable")
3366    def updateTable(
3367        self,
3368        update_col: str = "Id",
3369    ):
3370        """
3371        Method to update an existing table with a new column
3372
3373        Args:
3374            update_col: column to index the old and new tables on
3375
3376        Returns:
3377           existingTableId: synID of the already existing table that was updated
3378        """
3379        existing_table, existing_results = self.synStore.get_synapse_table(
3380            self.existingTableId
3381        )
3382
3383        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3384        # store table with existing etag data and impose restrictions as appropriate
3385        table_result = self.synStore.syn.store(
3386            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3387            isRestricted=self.restrict,
3388        )
3389        # We cannot store the Table in the `synapse_entity_tracker` because there is
3390        # no `Schema` on the table object. The above `.store()` function call would
3391        # also update the ETag of the entity within Synapse. Remove it from the tracker
3392        # and re-retrieve it later on if needed again.
3393        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3394
3395        return self.existingTableId
3396
3397
3398class DatasetFileView:
3399    """Helper class to create temporary dataset file views.
3400    This class can be used in conjunction with a 'with' statement.
3401    This will ensure that the file view is deleted automatically.
3402    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3403    """
3404
3405    def __init__(
3406        self,
3407        datasetId: str,
3408        synapse: Synapse,
3409        name: str = None,
3410        temporary: bool = True,
3411        parentId: str = None,
3412    ) -> None:
3413        """Create a file view scoped to a dataset folder.
3414
3415        Args:
3416            datasetId (str): Synapse ID for a dataset folder/project.
3417            synapse (Synapse): Used for Synapse requests.
3418            name (str): Name of the file view (temporary or not).
3419            temporary (bool): Whether to delete the file view on exit
3420                of either a 'with' statement or Python entirely.
3421            parentId (str, optional): Synapse ID specifying where to
3422                store the file view. Defaults to datasetId.
3423        """
3424
3425        self.datasetId = datasetId
3426        self.synapse = synapse
3427        self.is_temporary = temporary
3428
3429        # Use the provided name, or fall back to a descriptive default
3430        self.name = name or f"schematic annotation file view for {self.datasetId}"
3431
3432        if self.is_temporary:
3433            uid = secrets.token_urlsafe(5)
3434            self.name = f"{self.name} - UID {uid}"
3435
3436        # TODO: Allow a DCC admin to configure a "universal parent"
3437        #       Such as a Synapse project writeable by everyone.
3438        self.parentId = datasetId if parentId is None else parentId
3439
3440        # TODO: Create local sharing setting to hide from everyone else
3441        view_schema = EntityViewSchema(
3442            name=self.name,
3443            parent=self.parentId,
3444            scopes=self.datasetId,
3445            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3446            addDefaultViewColumns=False,
3447            addAnnotationColumns=True,
3448        )
3449
3450        # TODO: Handle failure due to insufficient permissions by
3451        #       creating a temporary new project to store view
3452        self.view_schema = self.synapse.store(view_schema)
3453
3454        # These are filled in after calling `self.query()`
3455        self.results = None
3456        self.table = None
3457
3458        # Ensure deletion of the file view (last resort)
3459        if self.is_temporary:
3460            atexit.register(self.delete)
3461
3462    def __enter__(self):
3463        """Return file view when entering 'with' statement."""
3464        return self
3465
3466    def __exit__(self, exc_type, exc_value, traceback):
3467        """Delete file view when exiting 'with' statement."""
3468        if self.is_temporary:
3469            self.delete()
3470
3471    def delete(self):
3472        """Delete the file view on Synapse without deleting local table."""
3473        if self.view_schema is not None:
3474            self.synapse.delete(self.view_schema)
3475            self.view_schema = None
3476
3477    def query(self, tidy=True, force=False):
3478        """Retrieve file view as a data frame (raw format sans index)."""
3479        if self.table is None or force:
3480            fileview_id = self.view_schema["id"]
3481            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3482            self.table = self.results.asDataFrame(
3483                rowIdAndVersionInIndex=False,
3484                na_values=STR_NA_VALUES_FILTERED,
3485                keep_default_na=False,
3486            )
3487        if tidy:
3488            self.tidy_table()
3489        return self.table
3490
3491    def tidy_table(self):
3492        """Convert raw file view data frame into more usable format."""
3493        assert self.table is not None, "Must call `self.query()` first."
3494        self._fix_default_columns()
3495        self._fix_list_columns()
3496        self._fix_int_columns()
3497        return self.table
3498
3499    def _fix_default_columns(self):
3500        """Rename default columns to match schematic expectations."""
3501
3502        # Drop ROW_VERSION column if present
3503        if "ROW_VERSION" in self.table:
3504            del self.table["ROW_VERSION"]
3505
3506        # Rename id column to entityId and set as data frame index
3507        if "ROW_ID" in self.table:
3508            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3509            self.table = self.table.set_index("entityId", drop=False)
3510            del self.table["ROW_ID"]
3511
3512        # Rename ROW_ETAG column to eTag and place at end of data frame
3513        if "ROW_ETAG" in self.table:
3514            row_etags = self.table.pop("ROW_ETAG")
3515
3516            # eTag column may already present if users annotated data without submitting manifest
3517            # we're only concerned with the new values and not the existing ones
3518            if "eTag" in self.table:
3519                del self.table["eTag"]
3520
3521            self.table.insert(len(self.table.columns), "eTag", row_etags)
3522
3523        return self.table
3524
3525    def _get_columns_of_type(self, types):
3526        """Helper function to get list of columns of a given type(s)."""
3527        matching_columns = []
3528        for header in self.results.headers:
3529            if header.columnType in types:
3530                matching_columns.append(header.name)
3531        return matching_columns
3532
3533    def _fix_list_columns(self):
3534        """Fix formatting of list-columns."""
3535        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3536        list_columns = self._get_columns_of_type(list_types)
3537        for col in list_columns:
3538            self.table[col] = self.table[col].apply(lambda x: ", ".join(str(v) for v in x))
3539        return self.table
3540
3541    def _fix_int_columns(self):
3542        """Ensure that integer-columns are actually integers."""
3543        int_columns = self._get_columns_of_type({"INTEGER"})
3544        for col in int_columns:
3545            # Coercing to string because NaN is a floating point value
3546            # and cannot exist alongside integers in a column
3547            def to_int_fn(x):
3548                return "" if np.isnan(x) else str(int(x))
3549
3550            self.table[col] = self.table[col].apply(to_int_fn)
3551        return self.table
logger = <Logger Synapse storage (WARNING)>
tracer = <opentelemetry.trace.ProxyTracer object>
@dataclass
class ManifestDownload:
 85@dataclass
 86class ManifestDownload(object):
 87    """
 88    syn: an object of type synapseclient.
 89    manifest_id: id of a manifest
 90    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
 91    """
 92
 93    syn: synapseclient.Synapse
 94    manifest_id: str
 95    synapse_entity_tracker: SynapseEntityTracker = field(
 96        default_factory=SynapseEntityTracker
 97    )
 98
 99    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
100        """
101        Try downloading a manifest to a specific folder (temporary or not). When the
102        `use_temporary_folder` is set to True, the manifest will be downloaded to a
103        temporary folder. This is useful for when the code is running as an API server
104        where multiple requests are being made at the same time. This will prevent
105        multiple requests from overwriting the same manifest file. When the
106        `use_temporary_folder` is set to False, the manifest will be downloaded to the
107        default manifest folder.
108
109        Args:
110            use_temporary_folder: boolean argument indicating if a temporary folder
111                should be used to store the manifest file. This is useful when running
112                this code as an API server where multiple requests could be made at the
113                same time. This is set to False when the code is being used from the
114                CLI. Defaults to True.
115
116        Return:
117            manifest_data: A Synapse file entity of the downloaded manifest
118        """
119        manifest_data = self.synapse_entity_tracker.get(
120            synapse_id=self.manifest_id,
121            syn=self.syn,
122            download_file=False,
123            retrieve_if_not_present=False,
124        )
125        current_span = trace.get_current_span()
126        if (
127            manifest_data
128            and (file_handle := manifest_data.get("_file_handle", None))
129            and current_span.is_recording()
130        ):
131            current_span.set_attribute(
132                "schematic.manifest_size", file_handle.get("contentSize", 0)
133            )
134
135        if manifest_data and manifest_data.path:
136            return manifest_data
137
138        if "SECRETS_MANAGER_SECRETS" in os.environ:
139            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
140            cleanup_temporary_storage(
141                temporary_manifest_storage, time_delta_seconds=3600
142            )
143            # create a new directory to store manifest
144            if not os.path.exists(temporary_manifest_storage):
145                os.mkdir(temporary_manifest_storage)
146            # create temporary folders for storing manifests
147            download_location = create_temp_folder(
148                path=temporary_manifest_storage,
149                prefix=f"{self.manifest_id}-{time.time()}-",
150            )
151        else:
152            if use_temporary_folder:
153                download_location = create_temp_folder(
154                    path=CONFIG.manifest_folder,
155                    prefix=f"{self.manifest_id}-{time.time()}-",
156                )
157            else:
158                download_location = CONFIG.manifest_folder
159
160        manifest_data = self.synapse_entity_tracker.get(
161            synapse_id=self.manifest_id,
162            syn=self.syn,
163            download_file=True,
164            retrieve_if_not_present=True,
165            download_location=download_location,
166        )
167
 168        # Rename the downloaded file. This matters when we are re-using a file
 169        # that was previously downloaded but has since been renamed: the file
 170        # returned by the Synapse client is a direct copy of that renamed file.
 171        # This code resets the file name to the original name that was used to
 172        # download the file. Note: an MD5 checksum of the file will still be
 173        # performed, so if the file has changed, it will be downloaded
 174        # again.
175        filename = manifest_data._file_handle.fileName
176        if filename != os.path.basename(manifest_data.path):
177            parent_folder = os.path.dirname(manifest_data.path)
178            manifest_original_name_and_path = os.path.join(parent_folder, filename)
179
180            self.syn.cache.remove(
181                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
182            )
183            os.rename(manifest_data.path, manifest_original_name_and_path)
184            manifest_data.path = manifest_original_name_and_path
185            self.syn.cache.add(
186                file_handle_id=manifest_data.dataFileHandleId,
187                path=manifest_original_name_and_path,
188                md5=manifest_data._file_handle.contentMd5,
189            )
190
191        return manifest_data
192
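# --- Illustration (not part of the source; values are hypothetical) ---
# The per-request download folder embeds the manifest ID and a timestamp so
# that concurrent API requests never collide:
#
#     create_temp_folder(path="manifests", prefix=f"syn12345678-{time.time()}-")
#     # -> a new unique directory under "manifests/" whose name starts with that prefix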
 193    def _entity_type_checking(self) -> None:
 194        """
 195        Check the entity type of the entity that needs to be downloaded.
 196        Return:
 197            None; logs an error if the entity type is not 'file'.
 198        """
199        # check the type of entity
200        entity_type = entity_type_mapping(
201            syn=self.syn,
202            entity_id=self.manifest_id,
203            synapse_entity_tracker=self.synapse_entity_tracker,
204        )
205        if entity_type != "file":
206            logger.error(
207                f"You are using entity type: {entity_type}. Please provide a file ID"
208            )
209
210    def download_manifest(
211        self,
212        newManifestName: str = "",
213        manifest_df: pd.DataFrame = pd.DataFrame(),
214        use_temporary_folder: bool = True,
215    ) -> Union[str, File]:
216        """
217        Download a manifest based on a given manifest id.
218        Args:
219            newManifestName(optional): new name of a manifest that gets downloaded.
 220            manifest_df(optional): a dataframe containing the names and ids of manifests in a given asset view
221        Return:
222            manifest_data: synapse entity file object
223        """
224
225        # enables retrying if user does not have access to uncensored manifest
226        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
227        manifest_data = ""
228
229        # check entity type
230        self._entity_type_checking()
231
232        # download a manifest
233        try:
234            manifest_data = self._download_manifest_to_folder(
235                use_temporary_folder=use_temporary_folder
236            )
237        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
238            # if there's an error getting an uncensored manifest, try getting the censored manifest
239            if not manifest_df.empty:
240                censored_regex = re.compile(".*censored.*")
241                censored = manifest_df["name"].str.contains(censored_regex)
 242                new_manifest_id = manifest_df[censored]["id"].iloc[0]
243                self.manifest_id = new_manifest_id
244                try:
245                    manifest_data = self._download_manifest_to_folder(
246                        use_temporary_folder=use_temporary_folder
247                    )
248                except (
249                    SynapseUnmetAccessRestrictions,
250                    SynapseAuthenticationError,
251                ) as e:
252                    raise PermissionError(
253                        "You don't have access to censored and uncensored manifests in this dataset."
254                    ) from e
255            else:
256                logger.error(
257                    f"You don't have access to the requested resource: {self.manifest_id}"
258                )
259
260        if newManifestName and os.path.exists(manifest_data.get("path")):
261            # Rename the file we just made to the new name
262            new_manifest_filename = newManifestName + ".csv"
263
264            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
265            parent_folder = os.path.dirname(manifest_data.get("path"))
266
267            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
268
269            # Copy file to new location. The purpose of using a copy instead of a rename
270            # is to avoid any potential issues with the file being used in another
 271        # process. This avoids any potential race or concurrency conditions.
272            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
273
274            # Adding this to cache will allow us to re-use the already downloaded
275            # manifest file for up to 1 hour.
276            self.syn.cache.add(
277                file_handle_id=manifest_data.dataFileHandleId,
278                path=new_manifest_path_name,
279                md5=manifest_data._file_handle.contentMd5,
280            )
281
282            # Update file names/paths in manifest_data
283            manifest_data["name"] = new_manifest_filename
284            manifest_data["filename"] = new_manifest_filename
285            manifest_data["path"] = new_manifest_path_name
286
287        return manifest_data

syn: an object of type synapseclient.Synapse
manifest_id: id of a manifest
synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

ManifestDownload(
    syn: synapseclient.client.Synapse,
    manifest_id: str,
    synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = <factory>
)
syn: synapseclient.client.Synapse
manifest_id: str
def download_manifest(
    self,
    newManifestName: str = '',
    manifest_df: pandas.core.frame.DataFrame = <empty DataFrame>,
    use_temporary_folder: bool = True
) -> Union[str, synapseclient.entity.File]:

Download a manifest based on a given manifest id.

Arguments:
  • newManifestName(optional): new name of a manifest that gets downloaded.
  • manifest_df(optional): a dataframe containing the names and ids of manifests in a given asset view

Return:
  manifest_data: Synapse entity file object
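
A minimal usage sketch (not from the source; the manifest ID is hypothetical and a valid Synapse login is assumed):

import synapseclient
from schematic.store.synapse import ManifestDownload

syn = synapseclient.login()  # resolves credentials from cache or .synapseConfig
md = ManifestDownload(syn, manifest_id="syn12345678")  # hypothetical manifest ID
manifest_file = md.download_manifest(use_temporary_folder=True)
print(manifest_file.path)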

class SynapseStorage(schematic.store.base.BaseStorage):
 290class SynapseStorage(BaseStorage):
 291    """Implementation of Storage interface for datasets/files stored on Synapse.
 292    Provides utilities to list files in a specific project; update file annotations, create fileviews, etc.
 293
 294    TODO: Need to define the interface and rename and/or refactor some of the methods below.
 295    """
 296
 297    @tracer.start_as_current_span("SynapseStorage::__init__")
 298    def __init__(
 299        self,
 300        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
 301        access_token: Optional[str] = None,
 302        project_scope: Optional[list] = None,
 303        synapse_cache_path: Optional[str] = None,
 304        perform_query: Optional[bool] = True,
 305        columns: Optional[list] = None,
 306        where_clauses: Optional[list] = None,
 307    ) -> None:
 308        """Initializes a SynapseStorage object.
 309
 310        Args:
 311            token (Optional[str], optional):
 312              Optional token parameter as found in browser cookie upon login to synapse.
 313              Defaults to None.
 314            access_token (Optional[list], optional):
 315              Optional access token (personal or oauth).
 316              Defaults to None.
 317            project_scope (Optional[list], optional): Defaults to None.
 318            synapse_cache_path (Optional[str], optional):
 319              Location of synapse cache.
 320              Defaults to None.
 321        TODO:
 322            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
 323        """
 324        self.syn = self.login(synapse_cache_path, access_token)
 325        current_span = trace.get_current_span()
 326        if current_span.is_recording():
 327            current_span.set_attribute("user.id", self.syn.credentials.owner_id)
 328        self.project_scope = project_scope
 329        self.storageFileview = CONFIG.synapse_master_fileview_id
 330        self.manifest = CONFIG.synapse_manifest_basename
 331        self.root_synapse_cache = self.syn.cache.cache_root_dir
 332        self.synapse_entity_tracker = SynapseEntityTracker()
 333        if perform_query:
 334            self.query_fileview(columns=columns, where_clauses=where_clauses)
 335
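# --- Usage sketch (illustrative, not part of the source) ---
# Constructing the store logs in and, unless perform_query=False, runs the
# initial fileview query against CONFIG.synapse_master_fileview_id:
#
#     store = SynapseStorage()  # token resolved from env or .synapseConfig
#     scoped = SynapseStorage(project_scope=["syn11111111"], perform_query=False)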
 336    # TODO: When moving this over to a regular cron-job the following logic should be
 337    # out of `manifest_download`:
 338    # if "SECRETS_MANAGER_SECRETS" in os.environ:
 339    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 340    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
 341    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
 342    def _purge_synapse_cache(
 343        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
 344    ) -> None:
 345        """
 346        Purge synapse cache if it exceeds a certain size. Default to 1GB.
 347        Args:
 348            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
 349              before purging cache. Default is 1 GB.
 350            minute_buffer (int): All files created this amount of time or older will be deleted
 351        """
 352        # try clearing the cache
 353        # scan a directory and check size of files
 354        if os.path.exists(self.root_synapse_cache):
 355            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
 356                1024**3
 357            )
 358            nbytes = get_dir_size(self.root_synapse_cache)
 359            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
 360            # if 1 GB has already been taken, purge cache before 15 min
 361            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
 362                num_of_deleted_files = clear_synapse_cache(
 363                    self.syn.cache, minutes=minute_buffer
 364                )
 365                logger.info(
 366                    f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
 367                )
 368            else:
 369                # on AWS, the OS takes around 14-17% of our ephemeral storage (20GiB);
 370                # instead of guessing how much space we have left, log the size of .synapseCache here
 371                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
 372
 373    @tracer.start_as_current_span("SynapseStorage::query_fileview")
 374    def query_fileview(
 375        self,
 376        columns: Optional[list] = None,
 377        where_clauses: Optional[list] = None,
 378        force_requery: Optional[bool] = False,
 379    ) -> None:
 380        """
 381        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
 382        Is called once during initialization of the SynapseStorage object and can be called again later to apply a more limited scope for validation purposes.
 383        Args:
 384            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 385            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 386            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
 387        """
 388        self._purge_synapse_cache()
 389
 390        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
 391        self.new_query_different = True
 392
 393        # If a query has already been performed, store the query
 394        previous_query_built = hasattr(self, "fileview_query")
 395        if previous_query_built:
 396            previous_query = self.fileview_query
 397
 398        # Build a query with the current given parameters and check to see if it is different from the previous
 399        self._build_query(columns=columns, where_clauses=where_clauses)
 400        if previous_query_built:
 401            self.new_query_different = self.fileview_query != previous_query
 402
 403        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
 404        if self.new_query_different or force_requery:
 405            try:
 406                self.storageFileviewTable = self.syn.tableQuery(
 407                    query=self.fileview_query,
 408                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
 409            except SynapseHTTPError as exc:
 410                exception_text = str(exc)
 411                if "Unknown column path" in exception_text:
 412                    raise ValueError(
 413                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
 414                    )
 415                elif "Unknown column" in exception_text:
 416                    missing_column = exception_text.split("Unknown column ")[-1]
 417                    raise ValueError(
 418                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
 419                    )
 420                else:
 421                    raise AccessCredentialsError(self.storageFileview)
 422
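# --- Usage sketch (illustrative, not part of the source) ---
# Re-running with identical parameters is a no-op unless force_requery=True;
# a changed query (new columns or where clauses) always triggers a requery:
#
#     store.query_fileview(columns=["id", "path"], where_clauses=["type='file'"])
#     store.query_fileview(columns=["id", "path"], where_clauses=["type='file'"])  # skipped
#     store.query_fileview(columns=["id", "path"], where_clauses=["type='file'"], force_requery=True)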
 423    @staticmethod
 424    def build_clause_from_dataset_id(
 425        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
 426    ) -> str:
 427        """
 428        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
 429        Args:
 430            dataset_id: Synapse ID of a dataset that should be used to limit the query
 431            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
 432        Returns:
 433            clause for the query or an empty string if no dataset ID is provided
 434        """
 435        # Calling this method without specifying synIDs will complete but will not scope the view
 436        if (not dataset_id) and (not dataset_folder_list):
 437            return ""
 438
 439        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
 440        if dataset_folder_list:
 441            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
 442            return f"parentId IN ({search_folders})"
 443
 444        # `dataset_id` should be provided when all files are stored directly under the dataset folder
 445        return f"parentId='{dataset_id}'"
 446
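# --- Examples (return values follow directly from the code above) ---
#     SynapseStorage.build_clause_from_dataset_id(dataset_id="syn12345678")
#     # -> "parentId='syn12345678'"
#     SynapseStorage.build_clause_from_dataset_id(dataset_folder_list=["syn1", "syn2"])
#     # -> "parentId IN ('syn1', 'syn2')"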
 447    def _build_query(
 448        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
 449    ):
 450        """
 451        Method to build a query for Synapse FileViews
 452        Args:
 453            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 454            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 455            self.storageFileview (str): Synapse FileView ID
 456            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
 457                Gets added to where_clauses; included mainly for backwards compatibility and as a more user-friendly way of subsetting the view.
 458        """
 459        if columns is None:
 460            columns = []
 461        if where_clauses is None:
 462            where_clauses = []
 463
 464        if self.project_scope:
 465            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
 466            where_clauses.append(project_scope_clause)
 467
 468        if where_clauses:
 469            where_clauses = " AND ".join(where_clauses)
 470            where_clauses = f"WHERE {where_clauses} ;"
 471        else:
 472            where_clauses = ";"
 473
 474        if columns:
 475            columns = ",".join(columns)
 476        else:
 477            columns = "*"
 478
 479        self.fileview_query = (
 480            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
 481        )
 482
 483        return
 484
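# --- Example (illustrative; the fileview ID is hypothetical) ---
# With columns=["id", "path"] and where_clauses=["type='file'"], the method sets:
#
#     self.fileview_query == "SELECT id,path FROM syn99999999 WHERE type='file' ;"
#
# With no arguments it falls back to "SELECT * FROM syn99999999 ;".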
 485    @staticmethod
 486    @tracer.start_as_current_span("SynapseStorage::login")
 487    def login(
 488        synapse_cache_path: Optional[str] = None,
 489        access_token: Optional[str] = None,
 490    ) -> synapseclient.Synapse:
 491        """Login to Synapse
 492
 493        Args:
 494            access_token (Optional[str], optional): A synapse access token. Defaults to None.
 495            synapse_cache_path (Optional[str]): location of synapse cache
 496
 497        Raises:
 498            ValueError: If unable to log in with the access token
 499
 500        Returns:
 501            synapseclient.Synapse: A Synapse object that is logged in
 502        """
 503        # If no token is provided, try retrieving access token from environment
 504        if not access_token:
 505            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
 506
 507        # login using a token
 508        if access_token:
 509            try:
 510                syn = synapseclient.Synapse(
 511                    cache_root_dir=synapse_cache_path,
 512                    debug=False,
 513                    skip_checks=True,
 514                    cache_client=False,
 515                )
 516                syn.login(authToken=access_token, silent=True)
 517                current_span = trace.get_current_span()
 518                if current_span.is_recording():
 519                    current_span.set_attribute("user.id", syn.credentials.owner_id)
 520            except SynapseHTTPError as exc:
 521                raise ValueError(
 522                    "No access to resources. Please make sure that your token is correct"
 523                ) from exc
 524        else:
 525            # login using synapse credentials provided by user in .synapseConfig (default) file
 526            syn = synapseclient.Synapse(
 527                configPath=CONFIG.synapse_configuration_path,
 528                cache_root_dir=synapse_cache_path,
 529                debug=False,
 530                skip_checks=True,
 531                cache_client=False,
 532            )
 533            syn.login(silent=True)
 534            current_span = trace.get_current_span()
 535            if current_span.is_recording():
 536                current_span.set_attribute("user.id", syn.credentials.owner_id)
 537        return syn
 538
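# --- Usage sketch (illustrative, not part of the source) ---
# Token resolution mirrors the method above: the explicit access_token wins,
# then the SYNAPSE_ACCESS_TOKEN environment variable, then .synapseConfig:
#
#     syn = SynapseStorage.login()  # env var or .synapseConfig
#     syn = SynapseStorage.login(access_token="eyJ...")  # hypothetical token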
 539    def missing_entity_handler(method):
 540        def wrapper(*args, **kwargs):
 541            try:
 542                return method(*args, **kwargs)
 543            except SynapseHTTPError as ex:
 544                str_message = str(ex).replace("\n", "")
 545                if "trash" in str_message or "does not exist" in str_message:
 546                    logging.warning(str_message)
 547                    return None
 548                else:
 549                    raise ex
 550
 551        return wrapper
 552
 553    def async_missing_entity_handler(method):
 554        """Decorator to handle missing entities in async methods."""
 555
 556        async def wrapper(*args: Any, **kwargs: Any) -> Any:
 557            try:
 558                return await method(*args, **kwargs)
 559            except SynapseHTTPError as ex:
 560                str_message = str(ex).replace("\n", "")
 561                if "trash" in str_message or "does not exist" in str_message:
 562                    logging.warning(str_message)
 563                    return None
 564                else:
 565                    raise ex
 566
 567        return wrapper
 568
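# --- Usage sketch (illustrative; `get_table_entity` is hypothetical) ---
# Both decorators convert "trash"/"does not exist" SynapseHTTPErrors into a
# logged warning plus a None return, and re-raise everything else:
#
#     @missing_entity_handler
#     def get_table_entity(self, synapse_id):
#         return self.syn.get(synapse_id)
#
#     # get_table_entity(store, "syn000")  # -> None if the entity was deleted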
 569    def getStorageFileviewTable(self):
 570        """Returns the storageFileviewTable obtained during initialization."""
 571        return self.storageFileviewTable
 572
 573    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
 574        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
 575
 576        Args:
 577            currentUserId: synapse id for the user whose projects we want to get.
 578
 579        Returns:
 580            A dictionary with a next page token and the results.
 581        """
 582        all_results = self.syn.restGET(
 583            "/projects/user/{principalId}".format(principalId=currentUserId)
 584        )
 585
 586        while (
 587            "nextPageToken" in all_results
 588        ):  # iterate over next page token in results while there is any
 589            results_token = self.syn.restGET(
 590                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
 591                    principalId=currentUserId,
 592                    nextPageToken=all_results["nextPageToken"],
 593                )
 594            )
 595            all_results["results"].extend(results_token["results"])
 596
 597            if "nextPageToken" in results_token:
 598                all_results["nextPageToken"] = results_token["nextPageToken"]
 599            else:
 600                del all_results["nextPageToken"]
 601
 602        return all_results
 603
 604    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
 605    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
 606        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
 607
 608        Returns:
 609            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
 610        """
 611
 612        # get the set of all storage Synapse project accessible for this pipeline
 613        storageProjects = self.storageFileviewTable["projectId"].unique()
 614
 615        # get the set of storage Synapse project accessible for this user
 616        # get a list of projects from Synapse
 617        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
 618            current_user_id=self.syn.credentials.owner_id, syn=self.syn
 619        )
 620        project_id_to_name_dict = {}
 621        current_user_projects = []
 622        for project_header in current_user_project_headers:
 623            project_id_to_name_dict[project_header.get("id")] = project_header.get(
 624                "name"
 625            )
 626            current_user_projects.append(project_header.get("id"))
 627
 628        # find set of user projects that are also in this pipeline's storage projects set
 629        storageProjects = list(set(storageProjects) & set(current_user_projects))
 630
 631        # Limit projects to scope if specified
 632        if project_scope:
 633            storageProjects = list(set(storageProjects) & set(project_scope))
 634
 635            if not storageProjects:
 636                raise Warning(
 637                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
 638                )
 639
 640        # prepare a return list of project IDs and names
 641        projects = []
 642        for projectId in storageProjects:
 643            project_name_from_project_header = project_id_to_name_dict.get(projectId)
 644            projects.append((projectId, project_name_from_project_header))
 645
 646        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
 647
 648        return sorted_projects_list
 649
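# --- Usage sketch (illustrative; IDs and names are hypothetical) ---
#     store.getStorageProjects()
#     # -> [("syn11111111", "Project A"), ("syn22222222", "Project B")]
#     store.getStorageProjects(project_scope=["syn11111111"])  # scoped subset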
 650    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
 651    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
 652        """Gets all datasets in folder under a given storage project that the current user has access to.
 653
 654        Args:
 655            projectId: synapse ID of a storage project.
 656
 657        Returns:
 658            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
 659            None: If the projectId cannot be found on Synapse.
 660        """
 661
 662        # select all folders and fetch their names from within the storage project;
 663        # if folder content type is defined, only select folders that contain datasets
 664        if "contentType" in self.storageFileviewTable.columns:
 665            foldersTable = self.storageFileviewTable[
 666                (self.storageFileviewTable["contentType"] == "dataset")
 667                & (self.storageFileviewTable["projectId"] == projectId)
 668            ]
 669        else:
 670            foldersTable = self.storageFileviewTable[
 671                (self.storageFileviewTable["type"] == "folder")
 672                & (self.storageFileviewTable["parentId"] == projectId)
 673            ]
 674
 675        # get an array of tuples (folderId, folderName)
 676        # some folders are part of datasets; others contain datasets
 677        # each dataset parent is the project; folders part of a dataset have another folder as a parent
 678        # to get folders if and only if they contain datasets for each folder
 679        # check if folder's parent is the project; if so that folder contains a dataset,
 680        # unless the folder list has already been filtered to dataset folders based on contentType attribute above
 681
 682        datasetList = []
 683        folderProperties = ["id", "name"]
 684        for folder in list(
 685            foldersTable[folderProperties].itertuples(index=False, name=None)
 686        ):
 687            datasetList.append(folder)
 688
 689        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
 690
 691        return sorted_dataset_list
 692
 693    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
 694    def getFilesInStorageDataset(
 695        self, datasetId: str, fileNames: List = None, fullpath: bool = True
 696    ) -> List[Tuple[str, str]]:
 697        """Gets all files (excluding manifest files) in a given dataset folder.
 698
 699        Args:
 700            datasetId: synapse ID of a storage dataset.
 701            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
 702            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
 703            fullpath: if True return the full path as part of this filename; otherwise return just base filename
 704
 705        Returns:
 706            A list of files; the list consists of tuples (fileId, fileName).
 707
 708        Raises:
 709            ValueError: Dataset ID not found.
 710        """
 711        file_list = []
 712
 713        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
 714        if self.storageFileviewTable.empty:
 715            raise ValueError(
 716                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
 717            )
 718
 719        child_path = self.storageFileviewTable.loc[
 720            self.storageFileviewTable["parentId"] == datasetId, "path"
 721        ]
 722        if child_path.empty:
 723            raise LookupError(
 724                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
 725            )
 726        child_path = child_path.iloc[0]
 727
 728        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
 729        parent = child_path.split("/")[:-1]
 730        parent = "/".join(parent)
 731
 732        # Format dataset path to be used in table query
 733        dataset_path = f"'{parent}/%'"
 734
 735        # When querying, only include files to exclude entity files and subdirectories
 736        where_clauses = [f"path like {dataset_path}", "type='file'"]
 737
 738        # Requery the fileview to specifically get the files in the given dataset
 739        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
 740
 741        # Exclude manifest files
 742        non_manifest_files = self.storageFileviewTable.loc[
 743            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
 744            :,
 745        ]
 746
 747        # Remove all files that are not in the list of fileNames
 748        if fileNames:
 749            filename_regex = "|".join(fileNames)
 750
 751            matching_files = non_manifest_files["path"].str.contains(
 752                filename_regex, case=False, regex=True
 753            )
 754
 755            non_manifest_files = non_manifest_files.loc[matching_files, :]
 756
 757        # Truncate path if necessary
 758        if not fullpath:
 759            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
 760
 761        # Return list of files as expected by other methods
 762        file_list = list(non_manifest_files.itertuples(index=False, name=None))
 763
 764        return file_list
 765
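# --- Usage sketch (illustrative; IDs and paths are hypothetical) ---
#     store.getFilesInStorageDataset("syn33333333")
#     # -> [("syn44444444", "project/dataset/sample_A.bam"), ...]
#     store.getFilesInStorageDataset("syn33333333", fileNames=["sample_A"], fullpath=False)
#     # -> [("syn44444444", "sample_A.bam")]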
 766    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
 767        """If both censored and uncensored manifests are present, return the uncensored manifest id; if only one manifest is present, return its id; if several manifests are present, return the id of the first one.
 768        Args:
 769        manifest: a dataframe contains name and id of manifests in a given asset view
 770
 771        Return:
 772        manifest_syn_id: id of a given censored or uncensored manifest
 773        """
 774        censored_regex = re.compile(".*censored.*")
 775        censored = manifest["name"].str.contains(censored_regex)
 776        if any(censored):
 777            # Try to use uncensored manifest first
 778            not_censored = ~censored
 779            if any(not_censored):
 780                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
 781            # if only censored manifests are available, just use the first censored manifest
 782            else:
 783                manifest_syn_id = manifest["id"].iloc[0]
 784
 785        # otherwise, use the first (implied only) version that exists
 786        else:
 787            manifest_syn_id = manifest["id"].iloc[0]
 788
 789        return manifest_syn_id
 790
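# --- Example (illustrative, not part of the source) ---
# With both flavors present, the uncensored manifest wins:
#
#     manifest = pd.DataFrame({
#         "id": ["syn111", "syn222"],
#         "name": ["synapse_storage_manifest_censored.csv",
#                  "synapse_storage_manifest.csv"],
#     })
#     store._get_manifest_id(manifest)  # -> "syn222"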
 791    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
 792    def getDatasetManifest(
 793        self,
 794        datasetId: str,
 795        downloadFile: bool = False,
 796        newManifestName: str = "",
 797        use_temporary_folder: bool = True,
 798    ) -> Union[str, File]:
 799        """Gets the manifest associated with a given dataset.
 800
 801        Args:
 802            datasetId: synapse ID of a storage dataset.
 803            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
 804            newManifestName: new name of a manifest that gets downloaded
 805            use_temporary_folder: boolean argument indicating if a temporary folder
 806                should be used to store the manifest file. This is useful when running
 807                this code as an API server where multiple requests could be made at the
 808                same time. This is set to False when the code is being used from the
 809                CLI. Defaults to True.
 810
 811        Returns:
 812            manifest_syn_id (String): Synapse ID of existing manifest file.
 813            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
 814            "" (String): No pre-existing manifest in dataset.
 815        """
 816        manifest_data = ""
 817
 818        # get a list of files containing the manifest for this dataset (if any)
 819        all_files = self.storageFileviewTable
 820
 821        # construct regex based on manifest basename in the config
 822        manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv")
 823
 824        # search manifest based on given manifest basename regex above
 825        # and return a dataframe containing name and id of manifests in a given asset view
 826        manifest = all_files[
 827            (all_files["name"].str.contains(manifest_re, regex=True))
 828            & (all_files["parentId"] == datasetId)
 829        ]
 830
 831        manifest = manifest[["id", "name"]]
 832
 833        # if there is no pre-existing manifest in the specified dataset
 834        if manifest.empty:
 835            logger.warning(
 836                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
 837            )
 838            return ""
 839
 840        # if there is an existing manifest
 841        else:
 842            manifest_syn_id = self._get_manifest_id(manifest)
 843            if downloadFile:
 844                md = ManifestDownload(
 845                    self.syn,
 846                    manifest_id=manifest_syn_id,
 847                    synapse_entity_tracker=self.synapse_entity_tracker,
 848                )
 849                manifest_data = md.download_manifest(
 850                    newManifestName=newManifestName,
 851                    manifest_df=manifest,
 852                    use_temporary_folder=use_temporary_folder,
 853                )
 854                # TO DO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
 855                # then we should catch the error here instead of returning an empty string.
 856                if not manifest_data:
 857                    logger.debug(
 858                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
 859                    )
 860                return manifest_data
 861            return manifest_syn_id
 862
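# --- Usage sketch (illustrative; IDs are hypothetical) ---
#     store.getDatasetManifest("syn33333333")  # -> "syn55555555" or ""
#     manifest_file = store.getDatasetManifest(
#         "syn33333333", downloadFile=True, newManifestName="renamed_manifest"
#     )  # -> synapseclient File entity, or "" if no manifest exists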
 863    def getDataTypeFromManifest(self, manifestId: str):
 864        """Fetch a manifest and return data types of all columns
 865        Args:
 866            manifestId: synapse ID of a manifest
 867        """
 868        # get manifest file path
 869        manifest_entity = self.synapse_entity_tracker.get(
 870            synapse_id=manifestId, syn=self.syn, download_file=True
 871        )
 872        manifest_filepath = manifest_entity.path
 873
 874        # load manifest dataframe
 875        manifest = load_df(
 876            manifest_filepath,
 877            preserve_raw_input=False,
 878            data_model=False,
 879        )
 880
 881        # convert the dataFrame to use best possible dtypes.
 882        manifest_new = manifest.convert_dtypes()
 883
 884        # get data types of columns
 885        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
 886
 887        # return the result as a dictionary
 888        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
 889
 890        return result_dict
 891
 892    def _get_files_metadata_from_dataset(
 893        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
 894    ) -> Optional[dict]:
 895        """retrieve file ids under a particular datasetId
 896
 897        Args:
 898            datasetId (str): a dataset id
 899            only_new_files (bool): if only adding new files that are not already exist
 900            manifest (pd.DataFrame): metadata manifest dataframe. Default to None.
 901
 902        Returns:
 903            a dictionary that contains filename and entityid under a given datasetId or None if there is nothing under a given dataset id are not available
 904        """
 905        dataset_files = self.getFilesInStorageDataset(datasetId)
 906        if dataset_files:
 907            dataset_file_names_id_dict = self._get_file_entityIds(
 908                dataset_files, only_new_files=only_new_files, manifest=manifest
 909            )
 910            return dataset_file_names_id_dict
 911        else:
 912            return None
 913
 914    def add_entity_id_and_filename(
 915        self, datasetId: str, manifest: pd.DataFrame
 916    ) -> pd.DataFrame:
 917        """add entityid and filename column to an existing manifest assuming entityId column is not already present
 918
 919        Args:
 920            datasetId (str): dataset syn id
 921            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
 922
 923        Returns:
 924            pd.DataFrame: returns a pandas dataframe
 925        """
 926        # get file names and entity ids of a given dataset
 927        dataset_files_dict = self._get_files_metadata_from_dataset(
 928            datasetId, only_new_files=False
 929        )
 930
 931        if dataset_files_dict:
 932            # turn manifest dataframe back to a dictionary for operation
 933            manifest_dict = manifest.to_dict("list")
 934
 935            # update Filename column
 936            # add entityId column to the end
 937            manifest_dict.update(dataset_files_dict)
 938
 939            # if the component column exists in existing manifest, fill up that column
 940            if "Component" in manifest_dict.keys():
 941                manifest_dict["Component"] = manifest_dict["Component"] * max(
 942                    1, len(manifest_dict["Filename"])
 943                )
 944
 945            # turn dictionary back to a dataframe
 946            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
 947            manifest_df_updated = manifest_df_index.transpose()
 948
 949            # fill na with empty string
 950            manifest_df_updated = manifest_df_updated.fillna("")
 951
 952            # drop index
 953            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
 954
 955            return manifest_df_updated
 956        else:
 957            return manifest
 958
 959    def fill_in_entity_id_filename(
 960        self, datasetId: str, manifest: pd.DataFrame
 961    ) -> Tuple[List, pd.DataFrame]:
 962        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 963
 964        Args:
 965            datasetId (str): dataset syn id
 966            manifest (pd.DataFrame): existing manifest dataframe.
 967
 968        Returns:
 969            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 970        """
 971        # get dataset file names and entity id as a list of tuple
 972        dataset_files = self.getFilesInStorageDataset(datasetId)
 973
 974        # update manifest with additional filenames, if any
 975        # note that if there is an existing manifest and there are files in the dataset
 976        # the columns Filename and entityId are assumed to be present in manifest schema
 977        # TODO: use idiomatic panda syntax
 978        if not dataset_files:
 979            manifest = manifest.fillna("")
 980            return dataset_files, manifest
 981
 982        all_files = self._get_file_entityIds(
 983            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 984        )
 985        new_files = self._get_file_entityIds(
 986            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 987        )
 988
 989        all_files = pd.DataFrame(all_files)
 990        new_files = pd.DataFrame(new_files)
 991
 992        # update manifest so that it contains new dataset files
 993        manifest = (
 994            pd.concat([manifest, new_files], sort=False)
 995            .reset_index()
 996            .drop("index", axis=1)
 997        )
 998
 999        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
1000        manifest_reindex = manifest.set_index("entityId")
1001        all_files_reindex = all_files.set_index("entityId")
1002        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1003            manifest_reindex
1004        )
1005
1006        # Check if individual file paths in manifest and from synapse match
1007        file_paths_match = (
1008            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1009        )
1010
1011        # If all the paths do not match, update the manifest with the filepaths from synapse
1012        if not file_paths_match.all():
1013            manifest_reindex.loc[
1014                ~file_paths_match, "Filename"
1015            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1016
1017            # reformat manifest for further use
1018            manifest = manifest_reindex.reset_index()
1019            entityIdCol = manifest.pop("entityId")
1020            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1021
1022        manifest = manifest.fillna("")
1023        return dataset_files, manifest
1024
1025    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1026    def updateDatasetManifestFiles(
1027        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1028    ) -> Union[Tuple[str, pd.DataFrame], None]:
1029        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1030
1031        Args:
1032            dmge: DataModelGraphExplorer Instance
1033            datasetId: synapse ID of a storage dataset.
1034            store: if set to True store updated manifest in asset store; if set to False
1035            return a Pandas dataframe containing updated manifest but do not store to asset store
1036
1037
1038        Returns:
1039            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1040            If there is no existing manifest or if the manifest does not have an entityId column, return None
1041        """
1042
1043        # get existing manifest Synapse ID
1044        manifest_id = self.getDatasetManifest(datasetId)
1045
1046        # if there is no manifest return None
1047        if not manifest_id:
1048            return None
1049
1050        manifest_entity = self.synapse_entity_tracker.get(
1051            synapse_id=manifest_id, syn=self.syn, download_file=True
1052        )
1053        manifest_filepath = manifest_entity.path
1054        manifest = load_df(manifest_filepath)
1055
1056        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1057        if "entityId" not in manifest.columns:
1058            return None
1059
1060        manifest_is_file_based = "Filename" in manifest.columns
1061
1062        if manifest_is_file_based:
1063            # update manifest with additional filenames, if any
1064            # note that if there is an existing manifest and there are files in the dataset
1065            # the columns Filename and entityId are assumed to be present in manifest schema
1066            # TODO: use idiomatic panda syntax
1067            dataset_files, manifest = self.fill_in_entity_id_filename(
1068                datasetId, manifest
1069            )
1070            if dataset_files:
1071                # update the manifest file, so that it contains the relevant entity IDs
1072                if store:
1073                    manifest.to_csv(manifest_filepath, index=False)
1074
1075                    # store manifest and update associated metadata with manifest on Synapse
1076                    manifest_id = self.associateMetadataWithFiles(
1077                        dmge, manifest_filepath, datasetId
1078                    )
1079
1080        return manifest_id, manifest
1081
1082    def _get_file_entityIds(
1083        self,
1084        dataset_files: List,
1085        only_new_files: bool = False,
1086        manifest: pd.DataFrame = None,
1087    ):
1088        """
1089        Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
1090
1091        Args:
1092            manifest: metadata manifest
1093            dataset_files: List of all files in a dataset
1094            only_new_files: boolean to control whether only new files are returned or all files in the dataset
1095        Returns:
1096            files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
1097        """
1098        files = {"Filename": [], "entityId": []}
1099
1100        if only_new_files:
1101            if manifest is None:
1102                raise UnboundLocalError(
1103                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1104                )
1105
1106            if "entityId" not in manifest.columns:
1107                raise ValueError(
1108                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1109                    "Please generate an empty manifest without annotations, manually add annotations to the "
1110                    "appropriate files in the manifest, and then try again."
1111                )
1112
1113            # find new files (that are not in the current manifest) if any
1114            for file_id, file_name in dataset_files:
1115                if file_id not in manifest["entityId"].values:
1116                    files["Filename"].append(file_name)
1117                    files["entityId"].append(file_id)
1118        else:
1119            # get all files
1120            for file_id, file_name in dataset_files:
1121                files["Filename"].append(file_name)
1122                files["entityId"].append(file_id)
1123
1124        return files
1125
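# --- Example (illustrative, not part of the source) ---
#     dataset_files = [("syn1", "a.txt"), ("syn2", "b.txt")]
#     self._get_file_entityIds(dataset_files)
#     # -> {"Filename": ["a.txt", "b.txt"], "entityId": ["syn1", "syn2"]}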
1126    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1127    def getProjectManifests(
1128        self, projectId: str
1129    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1130        """Gets all metadata manifest files across all datasets in a specified project.
1131
1132        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1133                 as a list of tuples, one for each manifest:
1134                    [
1135                        (
1136                            (datasetId, dataName),
1137                            (manifestId, manifestName),
1138                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1139                        ),
1140                        ...
1141                    ]
1142
1143        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1144        """
1145        component = None
1146        entity = None
1147        manifests = []
1148
1149        datasets = self.getStorageDatasetsInProject(projectId)
1150
1151        for datasetId, datasetName in datasets:
1152            # encode information about the manifest in a simple list (so that R clients can unpack it)
1153            # eventually can serialize differently
1154
1155            # Get synID of manifest for a dataset
1156            manifestId = self.getDatasetManifest(datasetId)
1157
1158            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1159            if manifestId:
1160                annotations = self.getFileAnnotations(manifestId)
1161
1162                # If manifest has annotations specifying component, use that
1163                if annotations and "Component" in annotations:
1164                    component = annotations["Component"]
1165                    entity = self.synapse_entity_tracker.get(
1166                        synapse_id=manifestId, syn=self.syn, download_file=False
1167                    )
1168                    manifest_name = entity["properties"]["name"]
1169
1170                # otherwise download the manifest and parse for information
1171                elif not annotations or "Component" not in annotations:
1172                    logger.debug(
1173                        f"No component annotations have been found for manifest {manifestId}. "
1174                        "The manifest will be downloaded and parsed instead. "
1175                        "For increased speed, add component annotations to manifest."
1176                    )
1177
1178                    manifest_info = self.getDatasetManifest(
1179                        datasetId, downloadFile=True
1180                    )
1181                    manifest_name = manifest_info["properties"].get("name", "")
1182
1183                    if not manifest_name:
1184                        logger.error(f"Failed to download manifest from {datasetId}")
1185
1186                    manifest_path = manifest_info["path"]
1187
1188                    manifest_df = load_df(manifest_path)
1189
1190                    # Get component from component column if it exists
1191                    if (
1192                        "Component" in manifest_df
1193                        and not manifest_df["Component"].empty
1194                    ):
1195                        # Deduplicate the values in the Component column
1196                        component = list(set(manifest_df["Component"]))
1197
1198                        # Added to address issues raised during DCA testing
1199                        if "" in component:
1200                            component.remove("")
1201
1202                        if len(component) == 1:
1203                            component = component[0]
1204                        elif len(component) > 1:
1205                            logger.warning(
1206                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1207                                "Behavior of manifests with multiple components is undefined."
1208                            )
1209            else:
1210                manifest_name = ""
1211                component = None
1212            if component:
1213                manifest = (
1214                    (datasetId, datasetName),
1215                    (manifestId, manifest_name),
1216                    (component, component),
1217                )
1218            elif manifestId:
1219                logger.debug(
1220                    f"Manifest {manifestId} does not have an associated Component"
1221                )
1222                manifest = (
1223                    (datasetId, datasetName),
1224                    (manifestId, manifest_name),
1225                    ("", ""),
1226                )
1227            else:
1228                manifest = (
1229                    (datasetId, datasetName),
1230                    ("", ""),
1231                    ("", ""),
1232                )
1233
1234            if manifest:
1235                manifests.append(manifest)
1236
1237        return manifests
1238
1239    def upload_project_manifests_to_synapse(
1240        self, dmge: DataModelGraphExplorer, projectId: str
1241    ) -> List[str]:
1242        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1243
1244        Returns: List of the dataset names whose manifests were loaded as tables.
1245        """
1246
1247        manifests = []
1248        manifest_loaded = []
1249        datasets = self.getStorageDatasetsInProject(projectId)
1250
1251        for datasetId, datasetName in datasets:
1252            # encode information about the manifest in a simple list (so that R clients can unpack it)
1253            # eventually can serialize differently
1254
1255            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1256
1257            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1258            if manifest_info:
1259                manifest_id = manifest_info["properties"]["id"]
1260                manifest_name = manifest_info["properties"]["name"]
1261                manifest_path = manifest_info["path"]
1262                manifest_df = load_df(manifest_path)
1263                manifest_table_id = self.uploadDB(
1264                    dmge=dmge,
1265                    manifest=manifest_df,
1266                    datasetId=datasetId,
1267                    table_name=datasetName,
1268                )
1269                manifest_loaded.append(datasetName)
1270        return manifest_loaded
1271
1272    def upload_annotated_project_manifests_to_synapse(
1273        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1274    ) -> Tuple[List[tuple], List[tuple]]:
1275        """
1276        Purpose:
1277            For all manifests in a project, upload them as a table and add annotations manifest csv.
1278            Assumes the manifest is already present as a CSV in a dataset in the project.
1279
1280        """
1281        # Imported locally rather than at module level
1282        from schematic.schemas.data_model_graph import DataModelGraph
1283        from schematic.schemas.data_model_parser import DataModelParser
1284
1285        # Instantiate DataModelParser and parse the model
1286        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1287        parsed_data_model = data_model_parser.parse_model()
1288
1289        # Instantiate DataModelGraph and generate the graph
1290        graph_data_model = DataModelGraph(parsed_data_model).generate_data_model_graph()
1291
1292        # Instantiate DataModelGraphExplorer
1293        dmge = DataModelGraphExplorer(graph_data_model)
1294
1295        manifests = []
1296        manifest_loaded = []
1297        datasets = self.getStorageDatasetsInProject(projectId)
1298        for datasetId, datasetName in datasets:
1299            # encode information about the manifest in a simple list (so that R clients can unpack it)
1300            # eventually can serialize differently
1301
1302            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1303            manifests.append(manifest)
1304
1305            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1306
1307            if manifest_info:
1308                manifest_id = manifest_info["properties"]["id"]
1309                manifest_name = manifest_info["properties"]["name"]
1310                manifest_path = manifest_info["path"]
1311                manifest = (
1312                    (datasetId, datasetName),
1313                    (manifest_id, manifest_name),
1314                    ("", ""),
1315                )
1316                if not dry_run:
1317                    self.associateMetadataWithFiles(
1318                        dmge, manifest_path, datasetId, manifest_record_type="table"
1319                    )
1320                manifest_loaded.append(manifest)
1321
1322        return manifests, manifest_loaded
1323
1324    def move_entities_to_new_project(
1325        self,
1326        projectId: str,
1327        newProjectId: str,
1328        returnEntities: bool = False,
1329        dry_run: bool = False,
1330    ):
1331        """
1332        For each manifest csv in a project, look for all the entitiy ids that are associated.
1333        Look up the entitiy in the files, move the entity to new project.
1334        """
1335
1336        manifests = []
1337        manifest_loaded = []
1338        datasets = self.getStorageDatasetsInProject(projectId)
1339        if datasets:
1340            for datasetId, datasetName in datasets:
1341                # encode information about the manifest in a simple list (so that R clients can unpack it)
1342                # eventually can serialize differently
1343
1344                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1345                manifests.append(manifest)
1346
1347                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1348                if manifest_info:
1349                    manifest_id = manifest_info["properties"]["id"]
1350                    manifest_name = manifest_info["properties"]["name"]
1351                    manifest_path = manifest_info["path"]
1352                    manifest_df = load_df(manifest_path)
1353
1354                    manifest = (
1355                        (datasetId, datasetName),
1356                        (manifest_id, manifest_name),
1357                        ("", ""),
1358                    )
1359                    manifest_loaded.append(manifest)
1360
1361                    annotation_entities = self.storageFileviewTable[
1362                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1363                        & (self.storageFileviewTable["type"] == "folder")
1364                    ]["id"]
1365
1366                    if returnEntities:
1367                        for entityId in annotation_entities:
1368                            if not dry_run:
1369                                moved_entity = self.syn.move(entityId, datasetId)
1370                                self.synapse_entity_tracker.add(
1371                                    synapse_id=moved_entity.id, entity=moved_entity
1372                                )
1373                            else:
1374                                logger.info(
1375                                    f"{entityId} will be moved to folder {datasetId}."
1376                                )
1377                    else:
1378                        # generate project folder
1379                        archive_project_folder = Folder(
1380                            projectId + "_archive", parent=newProjectId
1381                        )
1382                        archive_project_folder = self.syn.store(archive_project_folder)
1383                        self.synapse_entity_tracker.add(
1384                            synapse_id=archive_project_folder.id,
1385                            entity=archive_project_folder,
1386                        )
1387
1388                        # generate dataset folder
1389                        dataset_archive_folder = Folder(
1390                            "_".join([datasetId, datasetName, "archive"]),
1391                            parent=archive_project_folder.id,
1392                        )
1393                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1394                        self.synapse_entity_tracker.add(
1395                            synapse_id=dataset_archive_folder.id,
1396                            entity=dataset_archive_folder,
1397                        )
1398
1399                        for entityId in annotation_entities:
1400                            # move entities to folder
1401                            if not dry_run:
1402                                moved_entity = self.syn.move(
1403                                    entityId, dataset_archive_folder.id
1404                                )
1405                                self.synapse_entity_tracker.add(
1406                                    synapse_id=moved_entity.id, entity=moved_entity
1407                                )
1408                            else:
1409                                logger.info(
1410                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1411                                )
1412        else:
1413            raise LookupError(
1414                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1415            )
1416        return manifests, manifest_loaded
1417
1418    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1419    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1420        """Download synapse table as a pd dataframe; return table schema and etags as results too
1421
1422        Args:
1423            synapse_id: synapse ID of the table to query
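
            Returns:
                A tuple of (table contents as a pd.DataFrame, CsvFileTable query results).

            Example (sketch; assumes `store` is an initialized SynapseStorage
            instance and "syn99999999" is a placeholder table id):

                df, results = store.get_synapse_table("syn99999999")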
1424        """
1425
1426        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1427        df = results.asDataFrame(
1428            rowIdAndVersionInIndex=False,
1429            na_values=STR_NA_VALUES_FILTERED,
1430            keep_default_na=False,
1431        )
1432
1433        return df, results
1434
1435    @missing_entity_handler
1436    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1437    def uploadDB(
1438        self,
1439        dmge: DataModelGraphExplorer,
1440        manifest: pd.DataFrame,
1441        datasetId: str,
1442        table_name: str,
1443        restrict: bool = False,
1444        table_manipulation: str = "replace",
1445        table_column_names: str = "class_label",
1446    ):
1447        """
1448        Method to upload a database to an asset store. In synapse, this will upload a metadata table
1449
1450        Args:
1451            dmge: DataModelGraphExplorer object
1452            manifest: pd.Df manifest to upload
1453            datasetId: synID of the dataset for the manifest
1454            table_name: name of the table to be uploaded
1455            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1456            table_manipulation: str, 'replace', 'upsert', or 'update'. When a table with the given name already exists,
1457                determines whether the new metadata replaces the existing table (replace), is merged into it (upsert), or updates it in place (update).
1458            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1459                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1460                display label formatting.
1461        Returns:
1462            manifest_table_id: synID of the uploaded table
1463            manifest: the original manifest
1464            table_manifest: manifest formatted appropriately for the table
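
            Example (an illustrative sketch; assumes `store`, `dmge`, and a validated
            `manifest` DataFrame already exist; the dataset synID is hypothetical):

                table_id, manifest, table_manifest = store.uploadDB(
                    dmge=dmge,
                    manifest=manifest,
                    datasetId="syn12345678",
                    table_name="patient_synapse_storage_manifest_table",
                    table_manipulation="replace",
                )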
1465
1466        """
1467
1468        col_schema, table_manifest = self.formatDB(
1469            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1470        )
1471
1472        manifest_table_id = self.buildDB(
1473            datasetId,
1474            table_name,
1475            col_schema,
1476            table_manifest,
1477            table_manipulation,
1478            dmge,
1479            restrict,
1480        )
1481
1482        return manifest_table_id, manifest, table_manifest
1483
1484    @tracer.start_as_current_span("SynapseStorage::formatDB")
1485    def formatDB(self, dmge, manifest, table_column_names):
1486        """
1487        Method to format a manifest appropriately for upload as a table
1488
1489        Args:
1490            dmge: DataModelGraphExplorer object
1491            manifest: pd.Df manifest to upload
1492            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1493                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1494                display label formatting.
1495        Returns:
1496            col_schema: schema for table columns: type, size, etc
1497            table_manifest: formatted manifest
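
            Example (illustrative): with table_column_names="class_label", a display
            name such as "Family History" becomes the column label "FamilyHistory";
            the blacklisted characters "(", ")", ".", " ", and "-" are stripped:

                col_schema, table_manifest = store.formatDB(
                    dmge=dmge, manifest=manifest, table_column_names="class_label"
                )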
1498
1499        """
1500        # Rename the manifest columns to display names to match fileview
1501
1502        blacklist_chars = ["(", ")", ".", " ", "-"]
1503        manifest_columns = manifest.columns.tolist()
1504
1505        table_manifest = deepcopy(manifest)
1506
1507        if table_column_names == "display_name":
1508            cols = table_manifest.columns
1509
1510        elif table_column_names == "display_label":
1511            cols = [
1512                str(col).translate({ord(x): "" for x in blacklist_chars})
1513                for col in manifest_columns
1514            ]
1515
1516        elif table_column_names == "class_label":
1517            cols = [
1518                get_class_label_from_display_name(str(col)).translate(
1519                    {ord(x): "" for x in blacklist_chars}
1520                )
1521                for col in manifest_columns
1522            ]
1523        else:
1524            raise ValueError(
1525                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1526            )
1527
1528        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1529
1530        # Reset column names in table manifest
1531        table_manifest.columns = cols
1532
1533        # move entity id to end of df
1534        entity_col = table_manifest.pop("entityId")
1535        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1536
1537        # Get the column schema
1538        col_schema = as_table_columns(table_manifest)
1539
1540        # Ensure the Id column can hold a full UUID: set its maximum size to 64 (not auto-detected by as_table_columns)
1541        for i, col in enumerate(col_schema):
1542            if col["name"].lower() == "id":
1543                col_schema[i]["maximumSize"] = 64
1544
1545        return col_schema, table_manifest
1546
1547    @tracer.start_as_current_span("SynapseStorage::buildDB")
1548    def buildDB(
1549        self,
1550        datasetId: str,
1551        table_name: str,
1552        col_schema: List,
1553        table_manifest: pd.DataFrame,
1554        table_manipulation: str,
1555        dmge: DataModelGraphExplorer,
1556        restrict: bool = False,
1557    ):
1558        """
1559        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1560        Calls TableOperations class to execute
1561
1562        Args:
1563            datasetId: synID of the dataset for the manifest
1564            table_name: name of the table to be uploaded
1565            col_schema: schema for table columns: type, size, etc from `formatDB`
1566            table_manifest: formatted manifest that can be uploaded as a table
1567            table_manipulation: str, 'replace', 'upsert', or 'update'. In the case where a manifest table already exists, specifies whether the new metadata should replace the existing table (replace), be merged into it (upsert), or update it in place (update)
1568            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1569
1570        Returns:
1571            manifest_table_id: synID of the uploaded table
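
            Example (sketch; `col_schema` and `table_manifest` as produced by
            `formatDB`, dataset synID hypothetical):

                manifest_table_id = store.buildDB(
                    datasetId="syn12345678",
                    table_name=table_name,
                    col_schema=col_schema,
                    table_manifest=table_manifest,
                    table_manipulation="upsert",
                    dmge=dmge,
                )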
1572
1573        """
1574        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1575        existing_table_id = self.syn.findEntityId(
1576            name=table_name, parent=table_parent_id
1577        )
1578
1579        tableOps = TableOperations(
1580            synStore=self,
1581            tableToLoad=table_manifest,
1582            tableName=table_name,
1583            datasetId=datasetId,
1584            existingTableId=existing_table_id,
1585            restrict=restrict,
1586            synapse_entity_tracker=self.synapse_entity_tracker,
1587        )
1588
1589        if not table_manipulation or existing_table_id is None:
1590            manifest_table_id = tableOps.createTable(
1591                columnTypeDict=col_schema,
1592                specifySchema=True,
1593            )
1594        elif existing_table_id is not None:
1595            if table_manipulation.lower() == "replace":
1596                manifest_table_id = tableOps.replaceTable(
1597                    specifySchema=True,
1598                    columnTypeDict=col_schema,
1599                )
1600            elif table_manipulation.lower() == "upsert":
1601                manifest_table_id = tableOps.upsertTable(
1602                    dmge=dmge,
1603                )
1604            elif table_manipulation.lower() == "update":
1605                manifest_table_id = tableOps.updateTable()
1606
1607        if table_manipulation and table_manipulation.lower() == "upsert":
1608            table_entity = self.synapse_entity_tracker.get(
1609                synapse_id=existing_table_id or manifest_table_id,
1610                syn=self.syn,
1611                download_file=False,
1612            )
1613            annos = OldAnnotations(
1614                id=table_entity.id,
1615                etag=table_entity.etag,
1616                values=table_entity.annotations,
1617            )
1618            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1619            annos = self.syn.set_annotations(annos)
1620            table_entity.etag = annos.etag
1621            table_entity.annotations = annos
1622
1623        return manifest_table_id
1624
1625    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1626    def upload_manifest_file(
1627        self,
1628        manifest,
1629        metadataManifestPath,
1630        datasetId,
1631        restrict_manifest,
1632        component_name="",
1633    ):
1634        """Write the manifest (now including the entityId column) back to CSV and store it in Synapse; returns the Synapse ID of the stored manifest file."""
1635        manifest.to_csv(metadataManifestPath, index=False)
1636
1637        # store manifest to Synapse as a CSV
1638        # update file name
1639        file_name_full = metadataManifestPath.split("/")[-1]
1640        file_extension = file_name_full.split(".")[-1]
1641
1642        # Differentiate "censored" and "uncensored" manifest
1643        if "censored" in file_name_full:
1644            file_name_new = (
1645                os.path.basename(CONFIG.synapse_manifest_basename)
1646                + "_"
1647                + component_name
1648                + "_censored"
1649                + "."
1650                + file_extension
1651            )
1652        else:
1653            file_name_new = (
1654                os.path.basename(CONFIG.synapse_manifest_basename)
1655                + "_"
1656                + component_name
1657                + "."
1658                + file_extension
1659            )
1660
1661        manifest_synapse_file = None
1662        try:
1663            # Rename the file to file_name_new then revert
1664            # This is to maintain the original file name in case other code is
1665            # expecting that the file exists with the original name
1666            original_file_path = metadataManifestPath
1667            new_file_path = os.path.join(
1668                os.path.dirname(metadataManifestPath), file_name_new
1669            )
1670            os.rename(original_file_path, new_file_path)
1671
1672            manifest_synapse_file = self._store_file_for_manifest_upload(
1673                new_file_path=new_file_path,
1674                dataset_id=datasetId,
1675                existing_file_name=file_name_full,
1676                file_name_new=file_name_new,
1677                restrict_manifest=restrict_manifest,
1678            )
1679            manifest_synapse_file_id = manifest_synapse_file.id
1680
1681        finally:
1682            # Revert the file name back to the original
1683            os.rename(new_file_path, original_file_path)
1684
1685            if manifest_synapse_file:
1686                manifest_synapse_file.path = original_file_path
1687
1688        return manifest_synapse_file_id
1689
1690    def _store_file_for_manifest_upload(
1691        self,
1692        new_file_path: str,
1693        dataset_id: str,
1694        existing_file_name: str,
1695        file_name_new: str,
1696        restrict_manifest: bool,
1697    ) -> File:
1698        """Handles a create or update of a manifest file that is going to be uploaded.
1699        If we already have a copy of the Entity in memory we will update that instance,
1700        otherwise create a new File instance to be created in Synapse. Once stored
1701        this will add the file to the `synapse_entity_tracker` for future reference.
1702
1703        Args:
1704            new_file_path (str): The path to the new manifest file
1705            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
1706            existing_file_name (str): The name of the existing file
1707            file_name_new (str): The name of the new file
1708            restrict_manifest (bool): Whether the manifest should be restricted
1709
1710        Returns:
1711            File: The stored manifest file
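
            Example (sketch; the file names and dataset id are placeholders):

                stored = store._store_file_for_manifest_upload(
                    new_file_path="/tmp/synapse_storage_manifest_patient.csv",
                    dataset_id="syn12345678",
                    existing_file_name="synapse_storage_manifest.csv",
                    file_name_new="synapse_storage_manifest_patient.csv",
                    restrict_manifest=False,
                )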
1712        """
1713        local_tracked_file_instance = (
1714            self.synapse_entity_tracker.search_local_by_parent_and_name(
1715                name=existing_file_name, parent_id=dataset_id
1716            )
1717            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1718                name=file_name_new, parent_id=dataset_id
1719            )
1720        )
1721
1722        if local_tracked_file_instance:
1723            local_tracked_file_instance.path = new_file_path
1724            local_tracked_file_instance.description = (
1725                "Manifest for dataset " + dataset_id
1726            )
1727            manifest_synapse_file = local_tracked_file_instance
1728        else:
1729            manifest_synapse_file = File(
1730                path=new_file_path,
1731                description="Manifest for dataset " + dataset_id,
1732                parent=dataset_id,
1733                name=file_name_new,
1734            )
1735
1736        manifest_synapse_file = self.syn.store(
1737            manifest_synapse_file, isRestricted=restrict_manifest
1738        )
1739
1740        self.synapse_entity_tracker.add(
1741            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1742        )
1743        return manifest_synapse_file
1744
1745    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1746        """get annotations asynchronously
1747
1748        Args:
1749            synapse_id (str): synapse id of the entity that the annotations belong to
1750
1751        Returns:
1752            Dict[str, Any]: The requested entity bundle matching
1753            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1754        """
1755        return await get_entity_id_bundle2(
1756            entity_id=synapse_id,
1757            request={"includeAnnotations": True},
1758            synapse_client=self.syn,
1759        )
1760
1761    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1762        """store annotation in an async way
1763
1764        Args:
1765            annotation_dict (dict): annotation in a dictionary format
1766
1767        Returns:
1768            Annotations: The stored annotations.
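
            Example (sketch; `annos` is a bundle shaped like the return value of
            get_async_annotation, with "annotations", "etag", and "id" keys):

                result = await store.store_async_annotation(annotation_dict=annos)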
1769        """
1770        annotation_data = Annotations.from_dict(
1771            synapse_annotations=annotation_dict["annotations"]["annotations"]
1772        )
1773        annotation_class = Annotations(
1774            annotations=annotation_data,
1775            etag=annotation_dict["annotations"]["etag"],
1776            id=annotation_dict["annotations"]["id"],
1777        )
1778        annotation_storage_result = await annotation_class.store_async(
1779            synapse_client=self.syn
1780        )
1781        local_entity = self.synapse_entity_tracker.get(
1782            synapse_id=annotation_dict["annotations"]["id"],
1783            syn=self.syn,
1784            download_file=False,
1785            retrieve_if_not_present=False,
1786        )
1787        if local_entity:
1788            local_entity.etag = annotation_storage_result.etag
1789            local_entity.annotations = annotation_storage_result
1790        return annotation_storage_result
1791
1792    def process_row_annotations(
1793        self,
1794        dmge: DataModelGraphExplorer,
1795        metadata_syn: Dict[str, Any],
1796        hide_blanks: bool,
1797        csv_list_regex: str,
1798        annos: Dict[str, Any],
1799        annotation_keys: str,
1800    ) -> Dict[str, Any]:
1801        """Processes metadata annotations based on the logic below:
1802        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1803            An empty or whitespace-only string.
1804            A NaN value (if the annotation is a float).
1805        If either of the above conditions is met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
1806        If either of the above conditions is met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1807
1808        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1809        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1810
1811        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1812
1813        4. Returns the updated annotations dictionary.
1814
1815        Args:
1816            dmge (DataModelGraphExplorer): data model graph explorer
1817            metadata_syn (dict): metadata used for Synapse storage
1818            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1819            csv_list_regex (str): Regex to match with comma separated list
1820            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1821            annotation_keys (str): display_label/class_label
1822
1823        Returns:
1824            Dict[str, Any]: annotations as a dictionary
1825
1826        ```mermaid
1827        flowchart TD
1828            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1829            C -- Yes --> D{Is hide_blanks True?}
1830            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1831            D -- No --> F[Assign empty string to annotation key]
1832            C -- No --> G{Is anno_v a string?}
1833            G -- No --> H[Assign original value of anno_v to annotation key]
1834            G -- Yes --> I{Does anno_v match csv_list_regex?}
1835            I -- Yes --> J[Get validation rule of anno_k]
1836            J --> K{Does the validation rule contain 'list'}
1837            K -- Yes --> L[Split anno_v by commas and assign as list]
1838            I -- No --> H
1839            K -- No --> H
1840        ```
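
            Example (illustrative): a value "a, b, c" for an attribute with a "list"
            validation rule is stored as ["a", "b", "c"]; a NaN value is dropped when
            hide_blanks is True, or stored as "" when hide_blanks is False.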
1841        """
1842        for anno_k, anno_v in metadata_syn.items():
1843            # If hide_blanks is True, drop keys whose values are NaN, empty strings,
1844            # or whitespace-only strings from the dict of annotations to be uploaded
1845            if hide_blanks and (
1846                (isinstance(anno_v, str) and anno_v.strip() == "")
1847                or (isinstance(anno_v, float) and np.isnan(anno_v))
1848            ):
1849                annos["annotations"]["annotations"].pop(anno_k) if anno_k in annos[
1850                    "annotations"
1851                ]["annotations"].keys() else annos["annotations"]["annotations"]
1852                continue
1853
1854            # Otherwise save annotation as appropriate
1855            if isinstance(anno_v, float) and np.isnan(anno_v):
1856                annos["annotations"]["annotations"][anno_k] = ""
1857                continue
1858
1859            # Handle strings that match the csv_list_regex and pass the validation rule
1860            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1861                # Use a dictionary to dynamically choose the argument
1862                param = (
1863                    {"node_display_name": anno_k}
1864                    if annotation_keys == "display_label"
1865                    else {"node_label": anno_k}
1866                )
1867                node_validation_rules = dmge.get_node_validation_rules(**param)
1868
1869                if rule_in_rule_list("list", node_validation_rules):
1870                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1871                    continue
1872            # default: assign the original value
1873            annos["annotations"]["annotations"][anno_k] = anno_v
1874
1875        return annos
1876
1877    @async_missing_entity_handler
1878    async def format_row_annotations(
1879        self,
1880        dmge: DataModelGraphExplorer,
1881        row: pd.Series,
1882        entityId: str,
1883        hideBlanks: bool,
1884        annotation_keys: str,
1885    ) -> Union[None, Dict[str, Any]]:
1886        """Format row annotations
1887
1888        Args:
1889            dmge (DataModelGraphExplorer): data model graph explorer object
1890            row (pd.Series): row of the manifest
1891            entityId (str): entity id of the manifest
1892            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, upload Annotation keys with empty string values
1893            annotation_keys (str): display_label/class_label
1894
1895        Returns:
1896            Union[None, Dict[str, Any]]: if the entity id is in the trash can, return None. Otherwise, return the annotations
1897        """
1898        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support, e.g. no spaces or parentheses)
1899        # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest
1900        # this could create a divergence between manifest column and annotations. this should be ok for most use cases.
1901        # columns with special characters are outside of the schema
1902        metadataSyn = {}
1903        blacklist_chars = ["(", ")", ".", " ", "-"]
1904
1905        for k, v in row.to_dict().items():
1906            if annotation_keys == "display_label":
1907                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1908            elif annotation_keys == "class_label":
1909                keySyn = get_class_label_from_display_name(str(k)).translate(
1910                    {ord(x): "" for x in blacklist_chars}
1911                )
1912
1913            # Skip `Filename` and `ETag` columns when setting annotations
1914            if keySyn in ["Filename", "ETag", "eTag"]:
1915                continue
1916
1917            # Synapse limits annotation values to 500 characters, so truncate
1918            # any string value of 500 or more characters to its first 472
1919            # characters and append an explicit [truncatedByDataCuratorApp]
1920            # marker (499 characters total) to indicate that the cell value
1921            # has been truncated
1922            if isinstance(v, str) and len(v) >= 500:
1923                v = v[0:472] + "[truncatedByDataCuratorApp]"
1924
1925            metadataSyn[keySyn] = v
1926
1927        # This will first check if the entity is already in memory, and if so, that
1928        # instance is used. Unfortunately, the expected return format needs to match
1929        # the Synapse API, so we need to convert the annotations to the expected format.
1930        entity = self.synapse_entity_tracker.get(
1931            synapse_id=entityId,
1932            syn=self.syn,
1933            download_file=False,
1934            retrieve_if_not_present=False,
1935        )
1936        if entity is not None:
1937            synapse_annotations = _convert_to_annotations_list(
1938                annotations=entity.annotations
1939            )
1940            annos = {
1941                "annotations": {
1942                    "id": entity.id,
1943                    "etag": entity.etag,
1944                    "annotations": synapse_annotations,
1945                }
1946            }
1947        else:
1948            annos = await self.get_async_annotation(entityId)
1949
1950        # set annotation(s) for the various objects/items in a dataset on Synapse
1951        csv_list_regex = comma_separated_list_regex()
1952
1953        annos = self.process_row_annotations(
1954            dmge=dmge,
1955            metadata_syn=metadataSyn,
1956            hide_blanks=hideBlanks,
1957            csv_list_regex=csv_list_regex,
1958            annos=annos,
1959            annotation_keys=annotation_keys,
1960        )
1961
1962        return annos
1963
1964    @missing_entity_handler
1965    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
1966    def format_manifest_annotations(self, manifest, manifest_synapse_id):
1967        """
1968        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
1969        For now just getting the Component.
1970        """
1971
1972        entity = self.synapse_entity_tracker.get(
1973            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
1974        )
1975        is_file = entity.concreteType.endswith(".FileEntity")
1976        is_table = entity.concreteType.endswith(".TableEntity")
1977
1978        if is_file:
1979            # Get file metadata
1980            metadata = self.getFileAnnotations(manifest_synapse_id)
1981
1982            # If there is a defined component add it to the metadata.
1983            if "Component" in manifest.columns:
1984                # Gather component information
1985                component = manifest["Component"].unique()
1986
1987                # Double check that only a single component is listed,
1988                # else raise an error.
1989                if len(component) != 1:
1990                    raise ValueError(
1991                        "Manifest has more than one component. "
1992                        "Please check manifest and resubmit."
1993                    )
1994
1995                # Add component to metadata
1996                metadata["Component"] = component[0]
1997
1998        elif is_table:
1999            # Get table metadata
2000            metadata = self.getTableAnnotations(manifest_synapse_id)
2001
2002        # Get annotations
2003        annos = OldAnnotations(
2004            id=entity.id, etag=entity.etag, values=entity.annotations
2005        )
2006
2007        # Add metadata to the annotations
2008        for annos_k, annos_v in metadata.items():
2009            annos[annos_k] = annos_v
2010
2011        return annos
2012
2013    '''
2014    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
2015        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
2016        """
2017        Purpose:
2018            Works very similarly to associateMetadataWithFiles except takes in the manifest
2019            rather than the manifest path
2020
2021        """
2022
2023        # Add uuid for table updates and fill.
2024        if not "Uuid" in manifest.columns:
2025            manifest["Uuid"] = ''
2026
2027        for idx,row in manifest.iterrows():
2028            if not row["Uuid"]:
2029                gen_uuid = uuid.uuid4()
2030                row["Uuid"] = gen_uuid
2031                manifest.loc[idx, 'Uuid'] = gen_uuid
2032
2033        # add entityId as a column if not already there or
2034        # fill any blanks with an empty string.
2035        if not "entityId" in manifest.columns:
2036            manifest["entityId"] = ""
2037        else:
2038            manifest["entityId"].fillna("", inplace=True)
2039
2040        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
2041        dmge = DataModelGraphExplorer()
2042
2043        # Create table name here.
2044        if 'Component' in manifest.columns:
2045            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
2046        else:
2047            table_name = 'synapse_storage_manifest_table'
2048
2049        # Upload manifest as a table and get the SynID and manifest
2050        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
2051                                                    dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
2052
2053        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
2054        # also set metadata for each synapse entity as Synapse annotations
2055        for idx, row in manifest.iterrows():
2056            if not row["entityId"]:
2057                # If not using entityIds, fill with manifest_table_id so
2058                row["entityId"] = manifest_synapse_table_id
2059                entityId = ''
2060            else:
2061                # get the entity id corresponding to this row
2062                entityId = row["entityId"]
2063
2064        # Load manifest to synapse as a CSV File
2065        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
2066
2067        # Get annotations for the file manifest.
2068        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
2069
2070        self.syn.set_annotations(manifest_annotations)
2071
2072        logger.info("Associated manifest file with dataset on Synapse.")
2073
2074        # Update manifest Synapse table with new entity id column.
2075        self.make_synapse_table(
2076            table_to_load = table_manifest,
2077            dataset_id = datasetId,
2078            existingTableId = manifest_synapse_table_id,
2079            table_name = table_name,
2080            update_col = 'Uuid',
2081            specify_schema = False,
2082            )
2083
2084        # Get annotations for the table manifest
2085        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
2086        self.syn.set_annotations(manifest_annotations)
2087        return manifest_synapse_table_id
2088    '''
2089
2090    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
2091        """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing.
2092        Args:
2093            metadataManifestPath (str): path where manifest is stored
2094        Returns:
2095            manifest (pd.DataFrame): Manifest loaded as a pandas dataframe
2096        Raises:
2097            FileNotFoundError: Manifest file does not exist at provided path.
2098        """
2099        # read new manifest csv
2100        try:
2101            load_args = {
2102                "dtype": "string",
2103            }
2104            manifest = load_df(
2105                metadataManifestPath,
2106                preserve_raw_input=False,
2107                allow_na_values=False,
2108                **load_args,
2109            )
2110        except FileNotFoundError as err:
2111            raise FileNotFoundError(
2112                f"No manifest file was found at this path: {metadataManifestPath}"
2113            ) from err
2114        return manifest
2115
2116    def _add_id_columns_to_manifest(
2117        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
2118    ):
2119        """Helper function to add id and entityId columns to the manifest if they do not already exist, Fill id values per row.
2120        Args:
2121            Manifest loaded as a pd.Dataframe
2122        Returns (pd.DataFrame):
2123            Manifest df with new Id and EntityId columns (and UUID values) if they were not already present.
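
            Example (illustrative): a manifest lacking `Id` and `entityId` gains both
            columns; each row's `Id` is filled with a fresh UUID:

                manifest = store._add_id_columns_to_manifest(manifest, dmge)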
2124        """
2125
2126        # Add Id for table updates and fill.
2127        if not col_in_dataframe("Id", manifest):
2128            # See if schema has `Uuid` column specified
2129            try:
2130                uuid_col_in_schema = dmge.is_class_in_schema(
2131                    "Uuid"
2132                ) or dmge.is_class_in_schema("uuid")
2133            except KeyError:
2134                uuid_col_in_schema = False
2135
2136            # Rename `Uuid` column if it wasn't specified in the schema
2137            if col_in_dataframe("Uuid", manifest) and not uuid_col_in_schema:
2138                manifest.rename(columns={"Uuid": "Id"}, inplace=True)
2139            # If no `Uuid` column exists or it is specified in the schema, create a new `Id` column
2140            else:
2141                manifest["Id"] = ""
2142
2143        # Retrieve the ID column name (id, Id, and ID are treated the same).
2144        id_col_name = [col for col in manifest.columns if col.lower() == "id"][0]
2145
2146        # Check if values have been added to the Id column; if not, add a UUID so the value in the row is not blank.
2147        for idx, row in manifest.iterrows():
2148            if not row[id_col_name]:
2149                gen_uuid = str(uuid.uuid4())
2150                row[id_col_name] = gen_uuid
2151                manifest.loc[idx, id_col_name] = gen_uuid
2152
2153        # add entityId as a column if not already there or
2154        # fill any blanks with an empty string.
2155        if not col_in_dataframe("entityId", manifest):
2156            manifest["entityId"] = ""
2157        else:
2158            manifest["entityId"].fillna("", inplace=True)
2159
2160        return manifest
2161
2162    def _generate_table_name(self, manifest):
2163        """Helper function to generate a table name for upload to synapse.
2164
2165        Args:
2166            manifest (pd.DataFrame): loaded manifest
2167
2168        Returns:
2169            table_name (str): Name of the table to load
2170            component_name (str): Name of the manifest component (if applicable)
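
            Example (illustrative): a manifest whose `Component` column starts with
            "Biospecimen" yields ("biospecimen_synapse_storage_manifest_table", "biospecimen").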
2171        """
2172        # Create table name here.
2173        if "Component" in manifest.columns:
2174            component_name = manifest["Component"][0].lower()
2175            table_name = component_name + "_synapse_storage_manifest_table"
2176        else:
2177            component_name = ""
2178            table_name = "synapse_storage_manifest_table"
2179        return table_name, component_name
2180
2181    def _create_entity_id(self, idx, row, manifest, datasetId):
2182        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
2183        Args:
2184            idx: index of the current row; row: current row of manifest being processed
2185            manifest (pd.DataFrame): loaded df containing user supplied data.
2186            datasetId (str): synapse ID of folder containing the dataset
2187
2188        Returns:
2189            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
2190            entityId (str): Generated Entity Id.
2191
2192        """
2193        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
2194        rowEntity = self.syn.store(rowEntity)
2195        entityId = rowEntity["id"]
2196        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
2197        row["entityId"] = entityId
2198        manifest.loc[idx, "entityId"] = entityId
2199        return manifest, entityId
2200
2201    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
2202        """Process annotations and store them on synapse asynchronously
2203
2204        Args:
2205            requests (Set[asyncio.Task]): a set of tasks for formatting annotations, created by the format_row_annotations function in the previous step
2206
2207        Raises:
2208            RuntimeError: raised if a task failed to complete
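
            Example (sketch; `requests` is a set of asyncio.Task objects created from
            `format_row_annotations` coroutines):

                await store._process_store_annos(requests)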
2209        """
2210        while requests:
2211            done_tasks, pending_tasks = await asyncio.wait(
2212                requests, return_when=asyncio.FIRST_COMPLETED
2213            )
2214            requests = pending_tasks
2215
2216            for completed_task in done_tasks:
2217                try:
2218                    annos = completed_task.result()
2219
2220                    if isinstance(annos, Annotations):
2221                        logger.info(f"Successfully stored annotations for {annos.id}")
2222                    else:
2223                        # store annotations if they are not None
2224                        if annos:
2225                            entity_id = annos["annotations"]["id"]
2226                            logger.info(
2227                                f"Obtained and processed annotations for {entity_id} entity"
2228                            )
2229                            requests.add(
2230                                asyncio.create_task(
2231                                    self.store_async_annotation(annotation_dict=annos)
2232                                )
2233                            )
2234                except Exception as e:
2235                    raise RuntimeError(f"Annotation storage task failed with {repr(e)}.") from e
2236
2237    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2238    async def add_annotations_to_entities_files(
2239        self,
2240        dmge,
2241        manifest,
2242        manifest_record_type: str,
2243        datasetId: str,
2244        hideBlanks: bool,
2245        manifest_synapse_table_id="",
2246        annotation_keys: str = "class_label",
2247    ):
2248        """
2249        Depending on upload type, add ids to the entityId column. Add annotations to connected
2250        files and folders. Despite the name of this function, it also applies to folders.
2251
2252        Args:
2253            dmge: DataModelGraphExplorer Object
2254            manifest (pd.DataFrame): loaded df containing user supplied data.
2255            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2256            datasetId (str): synapse ID of folder containing the dataset
2257            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty string values.
2258            manifest_synapse_table_id (str): Default is an empty string ''.
2259            annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
2260                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2261                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2262        Returns:
2263            manifest (pd.DataFrame): modified to add entityId as appropriate
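
            Example (sketch; typically invoked via asyncio.run from the synchronous
            upload path, with hypothetical placeholder arguments):

                manifest = asyncio.run(
                    store.add_annotations_to_entities_files(
                        dmge, manifest, "file_and_entities", "syn12345678", hideBlanks=True
                    )
                )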
2264
2265        """
2266
2267        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2268        if "filename" in [col.lower() for col in manifest.columns]:
2269            # get current list of files and store as dataframe
2270            dataset_files = self.getFilesInStorageDataset(datasetId)
2271            files_and_entityIds = self._get_file_entityIds(
2272                dataset_files=dataset_files, only_new_files=False
2273            )
2274            file_df = pd.DataFrame(files_and_entityIds)
2275
2276            # Merge dataframes to add entityIds
2277            manifest = manifest.merge(
2278                file_df, how="left", on="Filename", suffixes=["_x", None]
2279            ).drop("entityId_x", axis=1)
2280
2281        # Fill `entityId` for each row if missing and annotate entity as appropriate
2282        requests = set()
2283        for idx, row in manifest.iterrows():
2284            if not row["entityId"] and (
2285                manifest_record_type == "file_and_entities"
2286                or manifest_record_type == "table_file_and_entities"
2287            ):
2288                manifest, entityId = self._create_entity_id(
2289                    idx, row, manifest, datasetId
2290                )
2291            elif not row["entityId"] and manifest_record_type == "table_and_file":
2292                # If not using entityIds, fill with manifest_table_id so the cell is not blank
2293                row["entityId"] = manifest_synapse_table_id
2294                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2295                entityId = ""
2296                # If the row is the manifest table, do not add annotations
2297            elif row["entityId"] == manifest_synapse_table_id:
2298                entityId = ""
2299            else:
2300                # get the file id of the file to annotate, collected in above step.
2301                entityId = row["entityId"]
2302
2303            # Adding annotations to connected files.
2304            if entityId:
2305                # Format annotations for Synapse
2306                annos_task = asyncio.create_task(
2307                    self.format_row_annotations(
2308                        dmge, row, entityId, hideBlanks, annotation_keys
2309                    )
2310                )
2311                requests.add(annos_task)
2312        await self._process_store_annos(requests)
2313        return manifest
2314
2315    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2316    def upload_manifest_as_table(
2317        self,
2318        dmge: DataModelGraphExplorer,
2319        manifest: pd.DataFrame,
2320        metadataManifestPath: str,
2321        datasetId: str,
2322        table_name: str,
2323        component_name: str,
2324        restrict: bool,
2325        manifest_record_type: str,
2326        hideBlanks: bool,
2327        table_manipulation: str,
2328        table_column_names: str,
2329        annotation_keys: str,
2330        file_annotations_upload: bool = True,
2331    ):
2332        """Upload manifest to Synapse as a table and csv.
2333        Args:
2334            dmge: DataModelGraphExplorer object
2335            manifest (pd.DataFrame): loaded df containing user supplied data.
2336            metadataManifestPath: path to csv containing a validated metadata manifest.
2337            datasetId (str): synapse ID of folder containing the dataset
2338            table_name (str): Generated to name the table being uploaded.
2339            component_name (str): Name of the component manifest that is currently being uploaded.
2340            restrict (bool): Flag for censored data.
2341            manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2342            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty string values.
2343            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2344            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2345                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2346                display label formatting.
2347            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2348                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2349                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2350            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2351        Return:
2352            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
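
            Example (an illustrative sketch; all IDs, paths, and names are hypothetical):

                manifest_synapse_file_id = store.upload_manifest_as_table(
                    dmge=dmge,
                    manifest=manifest,
                    metadataManifestPath="data/synapse_storage_manifest.csv",
                    datasetId="syn12345678",
                    table_name="patient_synapse_storage_manifest_table",
                    component_name="patient",
                    restrict=False,
                    manifest_record_type="table_and_file",
                    hideBlanks=False,
                    table_manipulation="replace",
                    table_column_names="class_label",
                    annotation_keys="class_label",
                )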
2353        """
2354        # Upload manifest as a table, get the ID and updated manifest.
2355        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2356            dmge=dmge,
2357            manifest=manifest,
2358            datasetId=datasetId,
2359            table_name=table_name,
2360            restrict=restrict,
2361            table_manipulation=table_manipulation,
2362            table_column_names=table_column_names,
2363        )
2364
2365        if file_annotations_upload:
2366            manifest = asyncio.run(
2367                self.add_annotations_to_entities_files(
2368                    dmge,
2369                    manifest,
2370                    manifest_record_type,
2371                    datasetId,
2372                    hideBlanks,
2373                    manifest_synapse_table_id,
2374                    annotation_keys,
2375                )
2376            )
2377        # Load manifest to synapse as a CSV File
2378        manifest_synapse_file_id = self.upload_manifest_file(
2379            manifest=manifest,
2380            metadataManifestPath=metadataManifestPath,
2381            datasetId=datasetId,
2382            restrict_manifest=restrict,
2383            component_name=component_name,
2384        )
2385
2386        # Set annotations for the file manifest.
2387        manifest_annotations = self.format_manifest_annotations(
2388            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2389        )
2390        annos = self.syn.set_annotations(annotations=manifest_annotations)
2391        manifest_entity = self.synapse_entity_tracker.get(
2392            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2393        )
2394        manifest_entity.annotations = annos
2395        manifest_entity.etag = annos.etag
2396
2397        logger.info("Associated manifest file with dataset on Synapse.")
2398
2399        # Update manifest Synapse table with new entity id column.
2400        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2401            dmge=dmge,
2402            manifest=manifest,
2403            datasetId=datasetId,
2404            table_name=table_name,
2405            restrict=restrict,
2406            table_manipulation="update",
2407            table_column_names=table_column_names,
2408        )
2409
2410        # Set annotations for the table manifest
2411        manifest_annotations = self.format_manifest_annotations(
2412            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2413        )
2414        annotations_manifest_table = self.syn.set_annotations(
2415            annotations=manifest_annotations
2416        )
2417        manifest_table_entity = self.synapse_entity_tracker.get(
2418            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2419        )
2420        manifest_table_entity.annotations = annotations_manifest_table
2421        manifest_table_entity.etag = annotations_manifest_table.etag
2422
2423        return manifest_synapse_file_id
2424
2425    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2426    def upload_manifest_as_csv(
2427        self,
2428        dmge,
2429        manifest,
2430        metadataManifestPath,
2431        datasetId,
2432        restrict,
2433        manifest_record_type,
2434        hideBlanks,
2435        component_name,
2436        annotation_keys: str,
2437        file_annotations_upload: bool = True,
2438    ):
2439        """Upload manifest to Synapse as a csv only.
2440        Args:
2441            dmge: DataModelGraphExplorer object
2442            manifest (pd.DataFrame): loaded df containing user supplied data.
2443            metadataManifestPath: path to csv containing a validated metadata manifest.
2444            datasetId (str): synapse ID of folder containing the dataset
2445            restrict (bool): Flag for censored data.
2446            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2447            hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
2448            annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display
2449                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2450                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2451            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2452        Return:
2453            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2454        """
2455        if file_annotations_upload:
2456            manifest = asyncio.run(
2457                self.add_annotations_to_entities_files(
2458                    dmge,
2459                    manifest,
2460                    manifest_record_type,
2461                    datasetId,
2462                    hideBlanks,
2463                    annotation_keys=annotation_keys,
2464                )
2465            )
2466
2467        # Load manifest to synapse as a CSV File
2468        manifest_synapse_file_id = self.upload_manifest_file(
2469            manifest,
2470            metadataManifestPath,
2471            datasetId,
2472            restrict,
2473            component_name=component_name,
2474        )
2475
2476        # Set annotations for the file manifest.
2477        manifest_annotations = self.format_manifest_annotations(
2478            manifest, manifest_synapse_file_id
2479        )
2480        annos = self.syn.set_annotations(manifest_annotations)
2481        manifest_entity = self.synapse_entity_tracker.get(
2482            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2483        )
2484        manifest_entity.annotations = annos
2485        manifest_entity.etag = annos.etag
2486
2487        logger.info("Associated manifest file with dataset on Synapse.")
2488
2489        return manifest_synapse_file_id
2490
2491    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2492    def upload_manifest_combo(
2493        self,
2494        dmge,
2495        manifest,
2496        metadataManifestPath,
2497        datasetId,
2498        table_name,
2499        component_name,
2500        restrict,
2501        manifest_record_type,
2502        hideBlanks,
2503        table_manipulation,
2504        table_column_names: str,
2505        annotation_keys: str,
2506        file_annotations_upload: bool = True,
2507    ):
2508        """Upload manifest to Synapse as a table and CSV with entities.
2509        Args:
2510            dmge: DataModelGraphExplorer object
2511            manifest (pd.DataFrame): loaded df containing user supplied data.
2512            metadataManifestPath: path to csv containing a validated metadata manifest.
2513            datasetId (str): synapse ID of folder containing the dataset
2514            table_name (str): Generated to name the table being uploaded.
2515            component_name (str): Name of the component manifest that is currently being uploaded.
2516            restrict (bool): Flag for censored data.
2517            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2518            hideBlanks (bool): Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
2519            table_manipulation (str): Specify how the manifest table should be stored on Synapse when a table with the same name already exists. Options are 'replace' and 'upsert'.
2520            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2521                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2522                display label formatting.
2523            annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display
2524                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2525                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2526            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2527        Return:
2528            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2529        """
2530        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2531            dmge=dmge,
2532            manifest=manifest,
2533            datasetId=datasetId,
2534            table_name=table_name,
2535            restrict=restrict,
2536            table_manipulation=table_manipulation,
2537            table_column_names=table_column_names,
2538        )
2539
2540        if file_annotations_upload:
2541            manifest = asyncio.run(
2542                self.add_annotations_to_entities_files(
2543                    dmge,
2544                    manifest,
2545                    manifest_record_type,
2546                    datasetId,
2547                    hideBlanks,
2548                    manifest_synapse_table_id,
2549                    annotation_keys=annotation_keys,
2550                )
2551            )
2552
2553        # Load manifest to synapse as a CSV File
2554        manifest_synapse_file_id = self.upload_manifest_file(
2555            manifest, metadataManifestPath, datasetId, restrict, component_name
2556        )
2557
2558        # Set annotations for the file manifest.
2559        manifest_annotations = self.format_manifest_annotations(
2560            manifest, manifest_synapse_file_id
2561        )
2562        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2563        manifest_entity = self.synapse_entity_tracker.get(
2564            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2565        )
2566        manifest_entity.annotations = file_manifest_annotations
2567        manifest_entity.etag = file_manifest_annotations.etag
2568        logger.info("Associated manifest file with dataset on Synapse.")
2569
2570        # Update manifest Synapse table with new entity id column.
2571        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2572            dmge=dmge,
2573            manifest=manifest,
2574            datasetId=datasetId,
2575            table_name=table_name,
2576            restrict=restrict,
2577            table_manipulation="update",
2578            table_column_names=table_column_names,
2579        )
2580
2581        # Set annotations for the table manifest
2582        manifest_annotations = self.format_manifest_annotations(
2583            manifest, manifest_synapse_table_id
2584        )
2585        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2586        manifest_entity = self.synapse_entity_tracker.get(
2587            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2588        )
2589        manifest_entity.annotations = table_manifest_annotations
2590        manifest_entity.etag = table_manifest_annotations.etag
2591        return manifest_synapse_file_id
2592
2593    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2594    def associateMetadataWithFiles(
2595        self,
2596        dmge: DataModelGraphExplorer,
2597        metadataManifestPath: str,
2598        datasetId: str,
2599        manifest_record_type: str = "table_file_and_entities",
2600        hideBlanks: bool = False,
2601        restrict_manifest=False,
2602        table_manipulation: str = "replace",
2603        table_column_names: str = "class_label",
2604        annotation_keys: str = "class_label",
2605        file_annotations_upload: bool = True,
2606    ) -> str:
2607        """Associate metadata with files in a storage dataset already on Synapse.
2608        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2609
2610        If this is a new manifest, there may be no Synapse entities associated with the rows of this manifest;
2611        this may be due to the data type (e.g. clinical data) being tabular
2612        and not requiring files; to utilize uniform interfaces downstream
2613        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2614        and an entity column is added to the manifest containing the resulting
2615        entity IDs; a table is also created at present as an additional interface
2616        for downstream query and interaction with the data.
2617
2618        Args:
2619            dmge: DataModelGraphExplorer Object
2620            metadataManifestPath: path to csv containing a validated metadata manifest.
2621            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2622            Some datasets, e.g. clinical data, do not contain file IDs; the data is stored in a table, one row per item.
2623            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
2624            datasetId: synapse ID of folder containing the dataset
2625            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table_and_file' options in combination.
2626            hideBlanks: Default is False. Boolean flag; when True, annotation keys with blank values are not uploaded. When False, annotation keys with empty string values are uploaded.
2627            restrict_manifest (bool): Default is false. Flag for censored data.
2628            table_manipulation (str): Default is 'replace'. Specify how the manifest table should be stored on Synapse when a table with the same name already exists. Options are 'replace' and 'upsert'.
2629            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2630                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2631                display label formatting.
2632            annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display
2633                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2634                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2635        Returns:
2636            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2637        """
2638        # Read new manifest CSV:
2639        manifest = self._read_manifest(metadataManifestPath)
2640        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2641
2642        table_name, component_name = self._generate_table_name(manifest)
2643
2644        # Upload manifest to synapse based on user input (manifest_record_type)
2645        if manifest_record_type == "file_only":
2646            manifest_synapse_file_id = self.upload_manifest_as_csv(
2647                dmge=dmge,
2648                manifest=manifest,
2649                metadataManifestPath=metadataManifestPath,
2650                datasetId=datasetId,
2651                restrict=restrict_manifest,
2652                hideBlanks=hideBlanks,
2653                manifest_record_type=manifest_record_type,
2654                component_name=component_name,
2655                annotation_keys=annotation_keys,
2656                file_annotations_upload=file_annotations_upload,
2657            )
2658        elif manifest_record_type == "table_and_file":
2659            manifest_synapse_file_id = self.upload_manifest_as_table(
2660                dmge=dmge,
2661                manifest=manifest,
2662                metadataManifestPath=metadataManifestPath,
2663                datasetId=datasetId,
2664                table_name=table_name,
2665                component_name=component_name,
2666                restrict=restrict_manifest,
2667                hideBlanks=hideBlanks,
2668                manifest_record_type=manifest_record_type,
2669                table_manipulation=table_manipulation,
2670                table_column_names=table_column_names,
2671                annotation_keys=annotation_keys,
2672                file_annotations_upload=file_annotations_upload,
2673            )
2674        elif manifest_record_type == "file_and_entities":
2675            manifest_synapse_file_id = self.upload_manifest_as_csv(
2676                dmge=dmge,
2677                manifest=manifest,
2678                metadataManifestPath=metadataManifestPath,
2679                datasetId=datasetId,
2680                restrict=restrict_manifest,
2681                hideBlanks=hideBlanks,
2682                manifest_record_type=manifest_record_type,
2683                component_name=component_name,
2684                annotation_keys=annotation_keys,
2685                file_annotations_upload=file_annotations_upload,
2686            )
2687        elif manifest_record_type == "table_file_and_entities":
2688            manifest_synapse_file_id = self.upload_manifest_combo(
2689                dmge=dmge,
2690                manifest=manifest,
2691                metadataManifestPath=metadataManifestPath,
2692                datasetId=datasetId,
2693                table_name=table_name,
2694                component_name=component_name,
2695                restrict=restrict_manifest,
2696                hideBlanks=hideBlanks,
2697                manifest_record_type=manifest_record_type,
2698                table_manipulation=table_manipulation,
2699                table_column_names=table_column_names,
2700                annotation_keys=annotation_keys,
2701                file_annotations_upload=file_annotations_upload,
2702            )
2703        else:
2704            raise ValueError("Please enter a valid manifest_record_type.")
2705        return manifest_synapse_file_id
2706
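# Illustrative dispatch sketch (not part of the source; the IDs and the path
# below are hypothetical, and `store` is assumed to be an initialized
# SynapseStorage instance):
#
#     manifest_file_id = store.associateMetadataWithFiles(
#         dmge=dmge,
#         metadataManifestPath="synapse_storage_manifest_patient.csv",
#         datasetId="syn12345678",
#         manifest_record_type="table_and_file",  # routes to upload_manifest_as_table
#     )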
2707    def getTableAnnotations(self, table_id: str):
2708        """Generate dictionary of annotations for the given Synapse table.
2709        Synapse returns all custom annotations as lists since they
2710        can contain multiple values. In all cases, the values will
2711        be converted into strings and concatenated with ", ".
2712
2713        Args:
2714            table_id (str): Synapse ID for the table.
2715
2716        Returns:
2717            dict: Annotations as comma-separated strings.
2718        """
2719        try:
2720            entity = self.synapse_entity_tracker.get(
2721                synapse_id=table_id, syn=self.syn, download_file=False
2722            )
2723            is_table = entity.concreteType.endswith(".TableEntity")
2724            annotations_raw = entity.annotations
2725        except SynapseHTTPError:
2726            # If an error occurs with retrieving entity, skip it
2727            # This could be caused by a temporary file view that
2728            # was deleted since its ID was retrieved
2729            is_table = False
2730
2731        # Skip anything that isn't a table
2732        if not is_table:
2733            return None
2734
2735        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2736
2737        return annotations
2738
2739    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2740        """Generate dictionary of annotations for the given Synapse file.
2741        Synapse returns all custom annotations as lists since they
2742        can contain multiple values. In all cases, the values will
2743        be converted into strings and concatenated with ", ".
2744
2745        Args:
2746            fileId (str): Synapse ID for dataset file.
2747
2748        Returns:
2749            dict: Annotations as comma-separated strings.
2750        """
2751
2752        # Get entity metadata, including annotations
2753        try:
2754            entity = self.synapse_entity_tracker.get(
2755                synapse_id=fileId, syn=self.syn, download_file=False
2756            )
2757            is_file = entity.concreteType.endswith(".FileEntity")
2758            is_folder = entity.concreteType.endswith(".Folder")
2759            annotations_raw = entity.annotations
2760        except SynapseHTTPError:
2761            # If an error occurs with retrieving entity, skip it
2762            # This could be caused by a temporary file view that
2763            # was deleted since its ID was retrieved
2764            is_file, is_folder = False, False
2765
2766        # Skip anything that isn't a file or folder
2767        if not (is_file or is_folder):
2768            return None
2769
2770        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2771
2772        return annotations
2773
2774    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2775        # Extract annotations from their lists and stringify. For example:
2776        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2777        annotations = dict()
2778        for key, vals in annotations_raw.items():
2779            if isinstance(vals, list) and len(vals) == 1:
2780                annotations[key] = str(vals[0])
2781            else:
2782                annotations[key] = ", ".join(str(v) for v in vals)
2783
2784        # Add the file entity ID and eTag, which weren't lists
2785        assert fileId == entity.id, (
2786            "For some reason, the Synapse ID in the response doesn't match "
2787            "the Synapse ID sent in the request (via synapseclient)."
2788        )
2789        annotations["entityId"] = fileId
2790        annotations["eTag"] = entity.etag
2791
2792        return annotations
2793
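# Illustrative sketch (not part of the source): given annotations_raw of
# {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']},
# getEntityAnnotations returns
# {'YearofBirth': '1980', 'author': 'bruno, milen, sujay',
#  'entityId': <fileId>, 'eTag': <entity.etag>}.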
2794    def getDatasetAnnotations(
2795        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2796    ) -> pd.DataFrame:
2797        """Generate table for annotations across all files in given dataset.
2798
2799        Args:
2800            datasetId (str): Synapse ID for dataset folder.
2801            fill_na (bool): Whether to replace missing values with
2802                blank strings.
2803            force_batch (bool): Whether to force the function to use
2804                the batch mode, which uses a file view to retrieve
2805                annotations for a given dataset. Defaults to False;
2806                batch mode is used automatically when the dataset contains 50 or more files.
2807
2808        Returns:
2809            pd.DataFrame: Table of annotations.
2810        """
2811        # Get all files in given dataset
2812        dataset_files = self.getFilesInStorageDataset(datasetId)
2813
2814        # if there are no dataset files, there are no annotations
2815        # return an empty DataFrame
2816        if not dataset_files:
2817            return pd.DataFrame()
2818
2819        dataset_files_map = dict(dataset_files)
2820        dataset_file_ids, _ = list(zip(*dataset_files))
2821
2822        # Get annotations for each file from Step 1
2823        # Batch mode
2824        try_batch = len(dataset_files) >= 50 or force_batch
2825        if try_batch:
2826            try:
2827                logger.info("Trying batch mode for retrieving Synapse annotations")
2828                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2829            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2830                logger.info(
2831                    f"Unable to create a temporary file view bound to {datasetId}. "
2832                    "Defaulting to slower iterative retrieval of annotations."
2833                )
2834                # Default to the slower non-batch method
2835                logger.info("Batch mode failed (probably due to permission error)")
2836                try_batch = False
2837
2838        # Non-batch mode
2839        if not try_batch:
2840            logger.info("Using slower (non-batch) sequential mode")
2841            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2842            # Remove any annotations for non-file/folders (stored as None)
2843            records = filter(None, records)
2844            table = pd.DataFrame.from_records(records)
2845
2846        # Add filenames for the files that "survived" annotation retrieval
2847        filenames = [dataset_files_map[i] for i in table["entityId"]]
2848
2849        if "Filename" not in table.columns:
2850            table.insert(0, "Filename", filenames)
2851
2852        # Ensure that entityId and eTag are at the end
2853        entity_ids = table.pop("entityId")
2854        etags = table.pop("eTag")
2855        table.insert(len(table.columns), "entityId", entity_ids)
2856        table.insert(len(table.columns), "eTag", etags)
2857
2858        # Missing values are filled in with empty strings for Google Sheets
2859        if fill_na:
2860            table.fillna("", inplace=True)
2861
2862        # Force all values as strings
2863        return table.astype(str)
2864
2865    def raise_final_error(retry_state):  # tenacity callback: re-raise the final attempt's exception
2866        return retry_state.outcome.result()
2867
2868    def checkIfinAssetView(self, syn_id) -> bool:
2869        # get data in administrative fileview for this pipeline
2870        assetViewTable = self.getStorageFileviewTable()
2871        all_files = list(assetViewTable["id"])
2872        if syn_id in all_files:
2873            return True
2874        else:
2875            return False
2876
2877    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2878    @retry(
2879        stop=stop_after_attempt(5),
2880        wait=wait_chain(
2881            *[wait_fixed(10) for i in range(2)]
2882            + [wait_fixed(15) for i in range(2)]
2883            + [wait_fixed(20)]
2884        ),
2885        retry=retry_if_exception_type(LookupError),
2886        retry_error_callback=raise_final_error,
2887    )
2888    def getDatasetProject(self, datasetId: str) -> str:
2889        """Get parent project for a given dataset ID.
2890
2891        Args:
2892            datasetId (str): Synapse entity ID (folder or project).
2893
2894        Raises:
2895            ValueError: Raised if Synapse ID cannot be retrieved
2896            by the user or if it doesn't appear in the file view.
2897
2898        Returns:
2899            str: The Synapse ID for the parent project.
2900        """
2901
2902        # Subset main file view
2903        dataset_index = self.storageFileviewTable["id"] == datasetId
2904        dataset_row = self.storageFileviewTable[dataset_index]
2905
2906        # re-query if no datasets found
2907        if dataset_row.empty:
2908            sleep(5)
2909            self.query_fileview(force_requery=True)
2910            # Subset main file view
2911            dataset_index = self.storageFileviewTable["id"] == datasetId
2912            dataset_row = self.storageFileviewTable[dataset_index]
2913
2914        # Return `projectId` for given row if only one found
2915        if len(dataset_row) == 1:
2916            dataset_project = dataset_row["projectId"].values[0]
2917            return dataset_project
2918
2919        # Otherwise, check if already project itself
2920        try:
2921            syn_object = self.synapse_entity_tracker.get(
2922                synapse_id=datasetId, syn=self.syn, download_file=False
2923            )
2924            if syn_object.properties["concreteType"].endswith("Project"):
2925                return datasetId
2926        except SynapseHTTPError:
2927            raise PermissionError(
2928                f"The given dataset ({datasetId}) isn't accessible with this "
2929                "user. This might be caused by a typo in the dataset Synapse ID."
2930            )
2931
2932        # If not, then assume dataset not in file view
2933        raise LookupError(
2934            f"The given dataset ({datasetId}) doesn't appear in the "
2935            f"configured file view ({self.storageFileview}). This might "
2936            "mean that the file view's scope needs to be updated."
2937        )
2938
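# Reading of the retry policy above (standard tenacity semantics, not new
# behavior): stop_after_attempt(5) allows up to five attempts, and wait_chain
# schedules the pauses between them as 10s, 10s, 15s, 15s; the final
# wait_fixed(20) would only apply to attempts beyond the configured stop.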
2939    def getDatasetAnnotationsBatch(
2940        self, datasetId: str, dataset_file_ids: Sequence[str] = None
2941    ) -> pd.DataFrame:
2942        """Generate table for annotations across all files in given dataset.
2943        This function uses a temporary file view to generate a table
2944        instead of iteratively querying for individual entity annotations.
2945        This function is expected to run much faster than
2946        `self.getDatasetAnnotations` on large datasets.
2947
2948        Args:
2949            datasetId (str): Synapse ID for dataset folder.
2950            dataset_file_ids (Sequence[str]): List of Synapse IDs
2951                for dataset files/folders used to subset the table.
2952
2953        Returns:
2954            pd.DataFrame: Table of annotations.
2955        """
2956        # Create data frame from annotations file view
2957        with DatasetFileView(datasetId, self.syn) as fileview:
2958            table = fileview.query()
2959
2960        if dataset_file_ids:
2961            table = table.loc[table.index.intersection(dataset_file_ids)]
2962
2963        table = table.reset_index(drop=True)
2964
2965        return table
2966
2967    def _get_table_schema_by_cname(self, table_schema):
2968        # assume no duplicate column names in the table
2969        table_schema_by_cname = {}
2970
2971        for col_record in table_schema:
2972            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
2973            table_schema_by_cname[col_record["name"]] = col_record
2974
2975        return table_schema_by_cname

Implementation of Storage interface for datasets/files stored on Synapse. Provides utilities to list files in a specific project, update file annotations, create file views, etc.

TODO: Need to define the interface and rename and/or refactor some of the methods below.

@tracer.start_as_current_span('SynapseStorage::__init__')
SynapseStorage( token: Optional[str] = None, access_token: Optional[str] = None, project_scope: Optional[list] = None, synapse_cache_path: Optional[str] = None, perform_query: Optional[bool] = True, columns: Optional[list] = None, where_clauses: Optional[list] = None)
297    @tracer.start_as_current_span("SynapseStorage::__init__")
298    def __init__(
299        self,
300        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
301        access_token: Optional[str] = None,
302        project_scope: Optional[list] = None,
303        synapse_cache_path: Optional[str] = None,
304        perform_query: Optional[bool] = True,
305        columns: Optional[list] = None,
306        where_clauses: Optional[list] = None,
307    ) -> None:
308        """Initializes a SynapseStorage object.
309
310        Args:
311            token (Optional[str], optional):
312              Optional token parameter as found in browser cookie upon login to synapse.
313              Defaults to None.
314            access_token (Optional[str], optional):
315              Optional access token (personal or oauth).
316              Defaults to None.
317            project_scope (Optional[list], optional): Defaults to None.
318            synapse_cache_path (Optional[str], optional):
319              Location of synapse cache.
320              Defaults to None.
321        TODO:
322            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
323        """
324        self.syn = self.login(synapse_cache_path, access_token)
325        current_span = trace.get_current_span()
326        if current_span.is_recording():
327            current_span.set_attribute("user.id", self.syn.credentials.owner_id)
328        self.project_scope = project_scope
329        self.storageFileview = CONFIG.synapse_master_fileview_id
330        self.manifest = CONFIG.synapse_manifest_basename
331        self.root_synapse_cache = self.syn.cache.cache_root_dir
332        self.synapse_entity_tracker = SynapseEntityTracker()
333        if perform_query:
334            self.query_fileview(columns=columns, where_clauses=where_clauses)

Initializes a SynapseStorage object.

Arguments:
  • token (Optional[str], optional): Optional token parameter as found in browser cookie upon login to synapse. Defaults to None.
  • access_token (Optional[str], optional): Optional access token (personal or oauth). Defaults to None.
  • project_scope (Optional[list], optional): Defaults to None.
  • synapse_cache_path (Optional[str], optional): Location of synapse cache. Defaults to None.
TODO:

Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how query_fileview is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.

syn
project_scope
storageFileview
manifest
root_synapse_cache
synapse_entity_tracker
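
A minimal construction sketch (assuming valid Synapse credentials via the SYNAPSE_ACCESS_TOKEN environment variable or a .synapseConfig file, and a master file view configured in CONFIG):

    from schematic.store.synapse import SynapseStorage

    # Log in and run the initial file view query immediately.
    store = SynapseStorage(perform_query=True)

    # Defer the (potentially large) file view query until it can be scoped.
    lazy_store = SynapseStorage(perform_query=False)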
@tracer.start_as_current_span('SynapseStorage::query_fileview')
def query_fileview( self, columns: Optional[list] = None, where_clauses: Optional[list] = None, force_requery: Optional[bool] = False) -> None:
373    @tracer.start_as_current_span("SynapseStorage::query_fileview")
374    def query_fileview(
375        self,
376        columns: Optional[list] = None,
377        where_clauses: Optional[list] = None,
378        force_requery: Optional[bool] = False,
379    ) -> None:
380        """
381        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
382        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
383        Args:
384            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
385            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
386            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
387        """
388        self._purge_synapse_cache()
389
390        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
391        self.new_query_different = True
392
393        # If a query has already been performed, store the query
394        previous_query_built = hasattr(self, "fileview_query")
395        if previous_query_built:
396            previous_query = self.fileview_query
397
398        # Build a query with the current given parameters and check to see if it is different from the previous
399        self._build_query(columns=columns, where_clauses=where_clauses)
400        if previous_query_built:
401            self.new_query_different = self.fileview_query != previous_query
402
403        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
404        if self.new_query_different or force_requery:
405            try:
406                self.storageFileviewTable = self.syn.tableQuery(
407                    query=self.fileview_query,
408                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
409            except SynapseHTTPError as exc:
410                exception_text = str(exc)
411                if "Unknown column path" in exception_text:
412                    raise ValueError(
413                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
414                    )
415                elif "Unknown column" in exception_text:
416                    missing_column = exception_text.split("Unknown column ")[-1]
417                    raise ValueError(
418                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
419                    )
420                else:
421                    raise AccessCredentialsError(self.storageFileview)

Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.

Arguments:
  • columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
  • where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
  • force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
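
For example, validation code might re-scope the view to a single dataset's files (a sketch; the clause syntax follows the Synapse table query language, and the dataset ID is hypothetical):

    store.query_fileview(
        columns=["id", "path"],
        where_clauses=["parentId='syn12345678'", "type='file'"],
        force_requery=True,
    )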
@staticmethod
def build_clause_from_dataset_id( dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None) -> str:
423    @staticmethod
424    def build_clause_from_dataset_id(
425        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
426    ) -> str:
427        """
428        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
429        Args:
430            dataset_id: Synapse ID of a dataset that should be used to limit the query
431            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
432        Returns:
433            clause for the query or an empty string if no dataset ID is provided
434        """
435        # Calling this method without specifying synIDs will complete but will not scope the view
436        if (not dataset_id) and (not dataset_folder_list):
437            return ""
438
439        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
440        if dataset_folder_list:
441            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
442            return f"parentId IN ({search_folders})"
443
444        # `dataset_id` should be provided when all files are stored directly under the dataset folder
445        return f"parentId='{dataset_id}'"

Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.

Arguments:
  • dataset_id: Synapse ID of a dataset that should be used to limit the query
  • dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:

clause for the query or an empty string if no dataset ID is provided
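
Illustrative inputs and outputs (Synapse IDs are hypothetical):

    SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
    # -> "parentId='syn123'"

    SynapseStorage.build_clause_from_dataset_id(dataset_folder_list=["syn123", "syn456"])
    # -> "parentId IN ('syn123', 'syn456')"

    SynapseStorage.build_clause_from_dataset_id()
    # -> ""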

@staticmethod
@tracer.start_as_current_span('SynapseStorage::login')
def login( synapse_cache_path: Optional[str] = None, access_token: Optional[str] = None) -> synapseclient.client.Synapse:
485    @staticmethod
486    @tracer.start_as_current_span("SynapseStorage::login")
487    def login(
488        synapse_cache_path: Optional[str] = None,
489        access_token: Optional[str] = None,
490    ) -> synapseclient.Synapse:
491        """Login to Synapse
492
493        Args:
494            access_token (Optional[str], optional): A synapse access token. Defaults to None.
495            synapse_cache_path (Optional[str]): location of synapse cache
496
497        Raises:
498            ValueError: If unable to log in with access token
499
500        Returns:
501            synapseclient.Synapse: A Synapse object that is logged in
502        """
503        # If no token is provided, try retrieving access token from environment
504        if not access_token:
505            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
506
507        # login using a token
508        if access_token:
509            try:
510                syn = synapseclient.Synapse(
511                    cache_root_dir=synapse_cache_path,
512                    debug=False,
513                    skip_checks=True,
514                    cache_client=False,
515                )
516                syn.login(authToken=access_token, silent=True)
517                current_span = trace.get_current_span()
518                if current_span.is_recording():
519                    current_span.set_attribute("user.id", syn.credentials.owner_id)
520            except SynapseHTTPError as exc:
521                raise ValueError(
522                    "No access to resources. Please make sure that your token is correct"
523                ) from exc
524        else:
525            # login using synapse credentials provided by user in .synapseConfig (default) file
526            syn = synapseclient.Synapse(
527                configPath=CONFIG.synapse_configuration_path,
528                cache_root_dir=synapse_cache_path,
529                debug=False,
530                skip_checks=True,
531                cache_client=False,
532            )
533            syn.login(silent=True)
534            current_span = trace.get_current_span()
535            if current_span.is_recording():
536                current_span.set_attribute("user.id", syn.credentials.owner_id)
537        return syn

Login to Synapse

Arguments:
  • access_token (Optional[str], optional): A synapse access token. Defaults to None.
  • synapse_cache_path (Optional[str]): location of synapse cache
Raises:
  • ValueError: If unable to log in with access token
Returns:

synapseclient.Synapse: A Synapse object that is logged in
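
A minimal login sketch (assuming a personal access token is exported beforehand; the cache path is hypothetical):

    import os

    os.environ["SYNAPSE_ACCESS_TOKEN"] = "<personal-access-token>"
    syn = SynapseStorage.login(synapse_cache_path="/tmp/synapse_cache")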

def missing_entity_handler(method):
539    def missing_entity_handler(method):
540        def wrapper(*args, **kwargs):
541            try:
542                return method(*args, **kwargs)
543            except SynapseHTTPError as ex:
544                str_message = str(ex).replace("\n", "")
545                if "trash" in str_message or "does not exist" in str_message:
546                    logging.warning(str_message)
547                    return None
548                else:
549                    raise ex
550
551        return wrapper
def async_missing_entity_handler(method):
553    def async_missing_entity_handler(method):
554        """Decorator to handle missing entities in async methods."""
555
556        async def wrapper(*args: Any, **kwargs: Any) -> Any:
557            try:
558                return await method(*args, **kwargs)
559            except SynapseHTTPError as ex:
560                str_message = str(ex).replace("\n", "")
561                if "trash" in str_message or "does not exist" in str_message:
562                    logging.warning(str_message)
563                    return None
564                else:
565                    raise ex
566
567        return wrapper

Decorator to handle missing entities in async methods.
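
Usage sketch for either decorator (the wrapped method below is hypothetical and would live inside the SynapseStorage class body): when the underlying call raises a SynapseHTTPError whose message mentions the trash can or a nonexistent entity, the wrapper logs a warning and returns None instead of propagating the error.

    @missing_entity_handler
    def get_entity_or_none(self, synapse_id):
        # Raises SynapseHTTPError if the entity was trashed or deleted;
        # the decorator converts that case into a logged warning and None.
        return self.synapse_entity_tracker.get(
            synapse_id=synapse_id, syn=self.syn, download_file=False
        )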

def getStorageFileviewTable(self):
569    def getStorageFileviewTable(self):
570        """Returns the storageFileviewTable obtained during initialization."""
571        return self.storageFileviewTable

Returns the storageFileviewTable obtained during initialization.

def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
573    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
574        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
575
576        Args:
577            currentUserId: synapse id for the user whose projects we want to get.
578
579        Returns:
580            A dictionary with a next page token and the results.
581        """
582        all_results = self.syn.restGET(
583            "/projects/user/{principalId}".format(principalId=currentUserId)
584        )
585
586        while (
587            "nextPageToken" in all_results
588        ):  # iterate over next page token in results while there is any
589            results_token = self.syn.restGET(
590                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
591                    principalId=currentUserId,
592                    nextPageToken=all_results["nextPageToken"],
593                )
594            )
595            all_results["results"].extend(results_token["results"])
596
597            if "nextPageToken" in results_token:
598                all_results["nextPageToken"] = results_token["nextPageToken"]
599            else:
600                del all_results["nextPageToken"]
601
602        return all_results

Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.

Arguments:
  • currentUserId: synapse id for the user whose projects we want to get.
Returns:

A dictionary with a next page token and the results.
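
The accumulated structure mirrors the paginated Synapse REST response (IDs and names are illustrative; only the keys used downstream are shown):

    {
        "results": [
            {"id": "syn111", "name": "Project A"},
            {"id": "syn222", "name": "Project B"},
        ]
        # "nextPageToken" is removed once the final page has been merged
    }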

@tracer.start_as_current_span('SynapseStorage::getStorageProjects')
def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
604    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
605    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
606        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
607
608        Returns:
609            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
610        """
611
612        # get the set of all storage Synapse project accessible for this pipeline
613        storageProjects = self.storageFileviewTable["projectId"].unique()
614
615        # get the set of storage Synapse project accessible for this user
616        # get a list of projects from Synapse
617        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
618            current_user_id=self.syn.credentials.owner_id, syn=self.syn
619        )
620        project_id_to_name_dict = {}
621        current_user_projects = []
622        for project_header in current_user_project_headers:
623            project_id_to_name_dict[project_header.get("id")] = project_header.get(
624                "name"
625            )
626            current_user_projects.append(project_header.get("id"))
627
628        # find set of user projects that are also in this pipeline's storage projects set
629        storageProjects = list(set(storageProjects) & set(current_user_projects))
630
631        # Limit projects to scope if specified
632        if project_scope:
633            storageProjects = list(set(storageProjects) & set(project_scope))
634
635            if not storageProjects:
636                raise Warning(
637                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
638                )
639
640        # prepare a return list of project IDs and names
641        projects = []
642        for projectId in storageProjects:
643            project_name_from_project_header = project_id_to_name_dict.get(projectId)
644            projects.append((projectId, project_name_from_project_header))
645
646        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
647
648        return sorted_projects_list

Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

Returns:

A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
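
A usage sketch (project IDs and names are hypothetical):

    projects = store.getStorageProjects(project_scope=["syn111", "syn222"])
    # -> [("syn111", "Project A"), ("syn222", "Project B")], sorted by project ID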

@tracer.start_as_current_span('SynapseStorage::getStorageDatasetsInProject')
def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
650    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
651    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
652        """Gets all datasets in folder under a given storage project that the current user has access to.
653
654        Args:
655            projectId: synapse ID of a storage project.
656
657        Returns:
658            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
659            None: If the projectId cannot be found on Synapse.
660        """
661
662        # select all folders and fetch their names from within the storage project;
663        # if folder content type is defined, only select folders that contain datasets
664        if "contentType" in self.storageFileviewTable.columns:
665            foldersTable = self.storageFileviewTable[
666                (self.storageFileviewTable["contentType"] == "dataset")
667                & (self.storageFileviewTable["projectId"] == projectId)
668            ]
669        else:
670            foldersTable = self.storageFileviewTable[
671                (self.storageFileviewTable["type"] == "folder")
672                & (self.storageFileviewTable["parentId"] == projectId)
673            ]
674
675        # get an array of tuples (folderId, folderName)
676        # some folders are part of datasets; others contain datasets
677        # each dataset parent is the project; folders part of a dataset have another folder as a parent
678        # to get folders if and only if they contain datasets for each folder
679        # check if folder's parent is the project; if so that folder contains a dataset,
680        # unless the folder list has already been filtered to dataset folders based on contentType attribute above
681
682        datasetList = []
683        folderProperties = ["id", "name"]
684        for folder in list(
685            foldersTable[folderProperties].itertuples(index=False, name=None)
686        ):
687            datasetList.append(folder)
688
689        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
690
691        return sorted_dataset_list

Gets all datasets in folder under a given storage project that the current user has access to.

Arguments:
  • projectId: synapse ID of a storage project.
Returns:

A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). None: If the projectId cannot be found on Synapse.
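
A usage sketch (IDs and names are hypothetical):

    datasets = store.getStorageDatasetsInProject(projectId="syn111")
    # -> [("syn333", "Bulk RNA-seq"), ("syn444", "Clinical")], sorted by dataset ID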

@tracer.start_as_current_span('SynapseStorage::getFilesInStorageDataset')
def getFilesInStorageDataset( self, datasetId: str, fileNames: List = None, fullpath: bool = True) -> List[Tuple[str, str]]:
693    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
694    def getFilesInStorageDataset(
695        self, datasetId: str, fileNames: List = None, fullpath: bool = True
696    ) -> List[Tuple[str, str]]:
697        """Gets all files (excluding manifest files) in a given dataset folder.
698
699        Args:
700            datasetId: synapse ID of a storage dataset.
701            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
702            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
703            fullpath: if True return the full path as part of this filename; otherwise return just base filename
704
705        Returns:
706            A list of files; the list consists of tuples (fileId, fileName).
707
708        Raises:
709            ValueError: Dataset ID not found.
710        """
711        file_list = []
712
713        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
714        if self.storageFileviewTable.empty:
715            raise ValueError(
716                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
717            )
718
719        child_path = self.storageFileviewTable.loc[
720            self.storageFileviewTable["parentId"] == datasetId, "path"
721        ]
722        if child_path.empty:
723            raise LookupError(
724                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
725            )
726        child_path = child_path.iloc[0]
727
728        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
729        parent = child_path.split("/")[:-1]
730        parent = "/".join(parent)
731
732        # Format dataset path to be used in table query
733        dataset_path = f"'{parent}/%'"
734
735        # When querying, only include files to exclude entity files and subdirectories
736        where_clauses = [f"path like {dataset_path}", "type='file'"]
737
738        # Requery the fileview to specifically get the files in the given dataset
739        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
740
741        # Exclude manifest files
742        non_manifest_files = self.storageFileviewTable.loc[
743            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
744            :,
745        ]
746
747        # Remove all files that are not in the list of fileNames
748        if fileNames:
749            filename_regex = "|".join(fileNames)
750
751            matching_files = non_manifest_files["path"].str.contains(
752                filename_regex, case=False, regex=True
753            )
754
755            non_manifest_files = non_manifest_files.loc[matching_files, :]
756
757        # Truncate path if necessary
758        if not fullpath:
759            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
760
761        # Return list of files as expected by other methods
762        file_list = list(non_manifest_files.itertuples(index=False, name=None))
763
764        return file_list

Gets all files (excluding manifest files) in a given dataset folder.

Arguments:
  • datasetId: synapse ID of a storage dataset.
  • fileNames: get a list of files with particular names; defaults to None, in which case all dataset files are returned (except bookkeeping files, e.g. metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
  • fullpath: if True return the full path as part of this filename; otherwise return just base filename
Returns:

A list of files; the list consists of tuples (fileId, fileName).

Raises:
  • ValueError: Dataset ID not found.
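
A usage sketch (dataset ID and file names are hypothetical):

    files = store.getFilesInStorageDataset(
        datasetId="syn555",
        fileNames=["sample_A.bam", "sample_B.bam"],
        fullpath=False,
    )
    # -> [("syn666", "sample_A.bam"), ("syn777", "sample_B.bam")]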
@tracer.start_as_current_span('SynapseStorage::getDatasetManifest')
def getDatasetManifest( self, datasetId: str, downloadFile: bool = False, newManifestName: str = '', use_temporary_folder: bool = True) -> Union[str, synapseclient.entity.File]:
791    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
792    def getDatasetManifest(
793        self,
794        datasetId: str,
795        downloadFile: bool = False,
796        newManifestName: str = "",
797        use_temporary_folder: bool = True,
798    ) -> Union[str, File]:
799        """Gets the manifest associated with a given dataset.
800
801        Args:
802            datasetId: synapse ID of a storage dataset.
803            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
804            newManifestName: new name of a manifest that gets downloaded
805            use_temporary_folder: boolean argument indicating if a temporary folder
806                should be used to store the manifest file. This is useful when running
807                this code as an API server where multiple requests could be made at the
808                same time. This is set to False when the code is being used from the
809                CLI. Defaults to True.
810
811        Returns:
812            manifest_syn_id (String): Synapse ID of existing manifest file.
813            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
814            "" (String): No pre-existing manifest in dataset.
815        """
816        manifest_data = ""
817
818        # get a list of files containing the manifest for this dataset (if any)
819        all_files = self.storageFileviewTable
820
821        # construct regex based on manifest basename in the config
822        manifest_re = re.compile(os.path.basename(self.manifest) + r".*\.[tc]sv")
823
824        # search manifest based on given manifest basename regex above
825        # and return a dataframe containing name and id of manifests in a given asset view
826        manifest = all_files[
827            (all_files["name"].str.contains(manifest_re, regex=True))
828            & (all_files["parentId"] == datasetId)
829        ]
830
831        manifest = manifest[["id", "name"]]
832
833        # if there is no pre-existing manifest in the specified dataset
834        if manifest.empty:
835            logger.warning(
836                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
837            )
838            return ""
839
840        # if there is an existing manifest
841        else:
842            manifest_syn_id = self._get_manifest_id(manifest)
843            if downloadFile:
844                md = ManifestDownload(
845                    self.syn,
846                    manifest_id=manifest_syn_id,
847                    synapse_entity_tracker=self.synapse_entity_tracker,
848                )
849                manifest_data = md.download_manifest(
850                    newManifestName=newManifestName,
851                    manifest_df=manifest,
852                    use_temporary_folder=use_temporary_folder,
853                )
854                # TO DO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
855                # then we should catch the error here without returning an empty string.
856                if not manifest_data:
857                    logger.debug(
858                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
859                    )
860                return manifest_data
861            return manifest_syn_id

Gets the manifest associated with a given dataset.

Arguments:
  • datasetId: synapse ID of a storage dataset.
  • downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
  • newManifestName: new name of a manifest that gets downloaded
  • use_temporary_folder: boolean argument indicating if a temporary folder should be used to store the manifest file. This is useful when running this code as an API server where multiple requests could be made at the same time. This is set to False when the code is being used from the CLI. Defaults to True.
Returns:

manifest_syn_id (String): Synapse ID of existing manifest file.
manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
"" (String): No pre-existing manifest in dataset.
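
A minimal usage sketch (placeholder IDs; `SynapseStorage()` setup is assumed as above):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup; requires Synapse credentials

# Without downloadFile, only the manifest's Synapse ID is returned ("" if none exists)
manifest_syn_id = store.getDatasetManifest("syn12345678")

# With downloadFile=True, the manifest File entity is downloaded and returned
manifest_entity = store.getDatasetManifest("syn12345678", downloadFile=True)
if manifest_entity:
    print(manifest_entity.path)
```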

def getDataTypeFromManifest(self, manifestId: str):
863    def getDataTypeFromManifest(self, manifestId: str):
864        """Fetch a manifest and return data types of all columns
865        Args:
866            manifestId: synapse ID of a manifest
867        """
868        # get manifest file path
869        manifest_entity = self.synapse_entity_tracker.get(
870            synapse_id=manifestId, syn=self.syn, download_file=True
871        )
872        manifest_filepath = manifest_entity.path
873
874        # load manifest dataframe
875        manifest = load_df(
876            manifest_filepath,
877            preserve_raw_input=False,
878            data_model=False,
879        )
880
881        # convert the dataFrame to use best possible dtypes.
882        manifest_new = manifest.convert_dtypes()
883
884        # get data types of columns
885        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
886
887        # return the result as a dictionary
888        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
889
890        return result_dict

Fetch a manifest and return data types of all columns

Arguments:
  • manifestId: synapse ID of a manifest
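
A minimal usage sketch (the manifest ID is a placeholder):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup

# Maps each manifest column to its inferred pandas dtype, e.g. {"Filename": "string"}
column_dtypes = store.getDataTypeFromManifest("syn87654321")
print(column_dtypes)
```
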
def add_entity_id_and_filename( self, datasetId: str, manifest: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
914    def add_entity_id_and_filename(
915        self, datasetId: str, manifest: pd.DataFrame
916    ) -> pd.DataFrame:
917        """Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present
918
919        Args:
920            datasetId (str): dataset syn id
921            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
922
923        Returns:
924            pd.DataFrame: the updated manifest with Filename values filled in and an entityId column appended
925        """
926        # get file names and entity ids of a given dataset
927        dataset_files_dict = self._get_files_metadata_from_dataset(
928            datasetId, only_new_files=False
929        )
930
931        if dataset_files_dict:
932            # turn manifest dataframe back to a dictionary for operation
933            manifest_dict = manifest.to_dict("list")
934
935            # update Filename column
936            # add entityId column to the end
937            manifest_dict.update(dataset_files_dict)
938
939            # if the component column exists in existing manifest, fill up that column
940            if "Component" in manifest_dict.keys():
941                manifest_dict["Component"] = manifest_dict["Component"] * max(
942                    1, len(manifest_dict["Filename"])
943                )
944
945            # turn dictionary back to a dataframe
946            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
947            manifest_df_updated = manifest_df_index.transpose()
948
949            # fill na with empty string
950            manifest_df_updated = manifest_df_updated.fillna("")
951
952            # drop index
953            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
954
955            return manifest_df_updated
956        else:
957            return manifest

Add entityId and Filename columns to an existing manifest, assuming the entityId column is not already present.

Arguments:
  • datasetId (str): dataset syn id
  • manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
Returns:

pd.DataFrame: the updated manifest with Filename values filled in and an entityId column appended.
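
A minimal sketch of the expected input shape (placeholder dataset ID and component name):

```python
import pandas as pd

from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup

# A manifest with an empty Filename column and no entityId column, as the method expects
manifest = pd.DataFrame({"Filename": [""], "Component": ["Biospecimen"]})
updated = store.add_entity_id_and_filename("syn12345678", manifest)
print(updated.columns.tolist())  # Filename values filled in, entityId column appended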

def fill_in_entity_id_filename( self, datasetId: str, manifest: pandas.core.frame.DataFrame) -> Tuple[List, pandas.core.frame.DataFrame]:
 959    def fill_in_entity_id_filename(
 960        self, datasetId: str, manifest: pd.DataFrame
 961    ) -> Tuple[List, pd.DataFrame]:
 962        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 963
 964        Args:
 965            datasetId (str): dataset syn id
 966            manifest (pd.DataFrame): existing manifest dataframe.
 967
 968        Returns:
 969            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 970        """
 971        # get dataset file names and entity id as a list of tuple
 972        dataset_files = self.getFilesInStorageDataset(datasetId)
 973
 974        # update manifest with additional filenames, if any
 975        # note that if there is an existing manifest and there are files in the dataset
 976        # the columns Filename and entityId are assumed to be present in manifest schema
 977        # TODO: use idiomatic panda syntax
 978        if not dataset_files:
 979            manifest = manifest.fillna("")
 980            return dataset_files, manifest
 981
 982        all_files = self._get_file_entityIds(
 983            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 984        )
 985        new_files = self._get_file_entityIds(
 986            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 987        )
 988
 989        all_files = pd.DataFrame(all_files)
 990        new_files = pd.DataFrame(new_files)
 991
 992        # update manifest so that it contains new dataset files
 993        manifest = (
 994            pd.concat([manifest, new_files], sort=False)
 995            .reset_index()
 996            .drop("index", axis=1)
 997        )
 998
 999        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
1000        manifest_reindex = manifest.set_index("entityId")
1001        all_files_reindex = all_files.set_index("entityId")
1002        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1003            manifest_reindex
1004        )
1005
1006        # Check if individual file paths in manifest and from synapse match
1007        file_paths_match = (
1008            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1009        )
1010
1011        # If all the paths do not match, update the manifest with the filepaths from synapse
1012        if not file_paths_match.all():
1013            manifest_reindex.loc[
1014                ~file_paths_match, "Filename"
1015            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1016
1017            # reformat manifest for further use
1018            manifest = manifest_reindex.reset_index()
1019            entityIdCol = manifest.pop("entityId")
1020            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1021
1022        manifest = manifest.fillna("")
1023        return dataset_files, manifest

fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.

Arguments:
  • datasetId (str): dataset syn id
  • manifest (pd.DataFrame): existing manifest dataframe.
Returns:

Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
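
A minimal usage sketch (placeholder IDs; the manifest columns mirror the assumptions described above):

```python
import pandas as pd

from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup

manifest = pd.DataFrame({"Filename": [""], "entityId": [""], "Component": ["Biospecimen"]})
dataset_files, manifest = store.fill_in_entity_id_filename("syn12345678", manifest)
print(len(dataset_files), manifest[["Filename", "entityId"]].head())
```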

@tracer.start_as_current_span('SynapseStorage::updateDatasetManifestFiles')
def updateDatasetManifestFiles( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, datasetId: str, store: bool = True) -> Optional[Tuple[str, pandas.core.frame.DataFrame]]:
1025    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1026    def updateDatasetManifestFiles(
1027        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1028    ) -> Union[Tuple[str, pd.DataFrame], None]:
1029        """Fetch the names and entity IDs of all current files in the dataset, if any; update the dataset's manifest with new files, if any.
1030
1031        Args:
1032            dmge: DataModelGraphExplorer Instance
1033            datasetId: synapse ID of a storage dataset.
1034            store: if set to True store updated manifest in asset store; if set to False
1035            return a Pandas dataframe containing updated manifest but do not store to asset store
1036
1037
1038        Returns:
1039            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1040            If there is no existing manifest or if the manifest does not have an entityId column, return None
1041        """
1042
1043        # get existing manifest Synapse ID
1044        manifest_id = self.getDatasetManifest(datasetId)
1045
1046        # if there is no manifest return None
1047        if not manifest_id:
1048            return None
1049
1050        manifest_entity = self.synapse_entity_tracker.get(
1051            synapse_id=manifest_id, syn=self.syn, download_file=True
1052        )
1053        manifest_filepath = manifest_entity.path
1054        manifest = load_df(manifest_filepath)
1055
1056        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1057        if "entityId" not in manifest.columns:
1058            return None
1059
1060        manifest_is_file_based = "Filename" in manifest.columns
1061
1062        if manifest_is_file_based:
1063            # update manifest with additional filenames, if any
1064            # note that if there is an existing manifest and there are files in the dataset
1065            # the columns Filename and entityId are assumed to be present in manifest schema
1066            # TODO: use idiomatic panda syntax
1067            dataset_files, manifest = self.fill_in_entity_id_filename(
1068                datasetId, manifest
1069            )
1070            if dataset_files:
1071                # update the manifest file, so that it contains the relevant entity IDs
1072                if store:
1073                    manifest.to_csv(manifest_filepath, index=False)
1074
1075                    # store manifest and update associated metadata with manifest on Synapse
1076                    manifest_id = self.associateMetadataWithFiles(
1077                        dmge, manifest_filepath, datasetId
1078                    )
1079
1080        return manifest_id, manifest

Fetch the names and entity IDs of all current files in the dataset, if any; update the dataset's manifest with new files, if any.

Arguments:
  • dmge: DataModelGraphExplorer Instance
  • datasetId: synapse ID of a storage dataset.
  • store: if set to True store updated manifest in asset store; if set to False return a Pandas dataframe containing updated manifest but do not store to asset store
Returns:

Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. If there is no existing manifest or if the manifest does not have an entityId column, return None
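
A sketch of a typical call. The data-model setup reuses the parser pattern shown in `upload_annotated_project_manifests_to_synapse` below; the module paths for DataModelParser and DataModelGraph, the JSON-LD path, and the dataset ID are assumptions:

```python
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup

# Build a DataModelGraphExplorer from a JSON-LD data model (path is a placeholder)
parsed = DataModelParser(path_to_data_model="model.jsonld").parse_model()
dmge = DataModelGraphExplorer(DataModelGraph(parsed).generate_data_model_graph())

# store=False computes the updated manifest without writing it back to the asset store
result = store.updateDatasetManifestFiles(dmge, "syn12345678", store=False)
if result is not None:
    manifest_id, manifest_df = result
```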

@tracer.start_as_current_span('SynapseStorage::getProjectManifests')
def getProjectManifests( self, projectId: str) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1126    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1127    def getProjectManifests(
1128        self, projectId: str
1129    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1130        """Gets all metadata manifest files across all datasets in a specified project.
1131
1132        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1133                 as a list of tuples, one for each manifest:
1134                    [
1135                        (
1136                            (datasetId, dataName),
1137                            (manifestId, manifestName),
1138                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1139                        ),
1140                        ...
1141                    ]
1142
1143        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1144        """
1145        component = None
1146        entity = None
1147        manifests = []
1148
1149        datasets = self.getStorageDatasetsInProject(projectId)
1150
1151        for datasetId, datasetName in datasets:
1152            # encode information about the manifest in a simple list (so that R clients can unpack it)
1153            # eventually can serialize differently
1154
1155            # Get synID of manifest for a dataset
1156            manifestId = self.getDatasetManifest(datasetId)
1157
1158            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1159            if manifestId:
1160                annotations = self.getFileAnnotations(manifestId)
1161
1162                # If manifest has annotations specifying component, use that
1163                if annotations and "Component" in annotations:
1164                    component = annotations["Component"]
1165                    entity = self.synapse_entity_tracker.get(
1166                        synapse_id=manifestId, syn=self.syn, download_file=False
1167                    )
1168                    manifest_name = entity["properties"]["name"]
1169
1170                # otherwise download the manifest and parse for information
1171                elif not annotations or "Component" not in annotations:
1172                    logging.debug(
1173                        f"No component annotations have been found for manifest {manifestId}. "
1174                        "The manifest will be downloaded and parsed instead. "
1175                        "For increased speed, add component annotations to manifest."
1176                    )
1177
1178                    manifest_info = self.getDatasetManifest(
1179                        datasetId, downloadFile=True
1180                    )
1181                    manifest_name = manifest_info["properties"].get("name", "")
1182
1183                    if not manifest_name:
1184                        logger.error(f"Failed to download manifests from {datasetId}")
1185
1186                    manifest_path = manifest_info["path"]
1187
1188                    manifest_df = load_df(manifest_path)
1189
1190                    # Get component from component column if it exists
1191                    if (
1192                        "Component" in manifest_df
1193                        and not manifest_df["Component"].empty
1194                    ):
1196                        component = list(set(manifest_df["Component"]))
1197
1198                        # Added to address issues raised during DCA testing
1199                        if "" in component:
1200                            component.remove("")
1201
1202                        if len(component) == 1:
1203                            component = component[0]
1204                        elif len(component) > 1:
1205                            logging.warning(
1206                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1207                                "Behavior of manifests with multiple components is undefined"
1208                            )
1209            else:
1210                manifest_name = ""
1211                component = None
1212            if component:
1213                manifest = (
1214                    (datasetId, datasetName),
1215                    (manifestId, manifest_name),
1216                    (component, component),
1217                )
1218            elif manifestId:
1219                logging.debug(
1220                    f"Manifest {manifestId} does not have an associated Component"
1221                )
1222                manifest = (
1223                    (datasetId, datasetName),
1224                    (manifestId, manifest_name),
1225                    ("", ""),
1226                )
1227            else:
1228                manifest = (
1229                    (datasetId, datasetName),
1230                    ("", ""),
1231                    ("", ""),
1232                )
1233
1234            if manifest:
1235                manifests.append(manifest)
1236
1237        return manifests

Gets all metadata manifest files across all datasets in a specified project.

Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest as a list of tuples, one for each manifest:

  [
    (
      (datasetId, dataName),
      (manifestId, manifestName),
      (componentSchemaLabel, componentSchemaLabel)  TODO: get component name from schema
    ),
    ...
  ]

TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
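
A minimal sketch of unpacking the returned tuples (project ID is a placeholder):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup

for dataset, manifest, component in store.getProjectManifests("syn11111111"):
    dataset_id, dataset_name = dataset
    manifest_id, manifest_name = manifest
    component_label, _ = component
    print(dataset_id, manifest_id or "<no manifest>", component_label or "<no component>")
```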

def upload_project_manifests_to_synapse( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, projectId: str) -> List[str]:
1239    def upload_project_manifests_to_synapse(
1240        self, dmge: DataModelGraphExplorer, projectId: str
1241    ) -> List[str]:
1242        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1243
1244        Returns: List of the dataset names whose manifests were loaded.
1245        """
1246
1247        manifests = []
1248        manifest_loaded = []
1249        datasets = self.getStorageDatasetsInProject(projectId)
1250
1251        for datasetId, datasetName in datasets:
1252            # encode information about the manifest in a simple list (so that R clients can unpack it)
1253            # eventually can serialize differently
1254
1255            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1256
1257            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1258            if manifest_info:
1259                manifest_id = manifest_info["properties"]["id"]
1260                manifest_name = manifest_info["properties"]["name"]
1261                manifest_path = manifest_info["path"]
1262                manifest_df = load_df(manifest_path)
1263                manifest_table_id = self.uploadDB(
1264                    dmge=dmge,
1265                    manifest=manifest_df,
1266                    datasetId=datasetId,
1267                    table_name=datasetName,
1268                )
1269                manifest_loaded.append(datasetName)
1270        return manifest_loaded

Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.

Returns: List of the dataset names whose manifests were loaded.

def upload_annotated_project_manifests_to_synapse( self, projectId: str, path_to_json_ld: str, dry_run: bool = False) -> Tuple[List, List]:
1272    def upload_annotated_project_manifests_to_synapse(
1273        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1274    ) -> Tuple[List, List]:
1275        """
1276        Purpose:
1277            For all manifests in a project, upload them as a table and add annotations from the manifest CSV.
1278            Assumes the manifest is already present as a CSV in a dataset in the project.
1279
1280        """
1281        # Instantiate DataModelParser
1282        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1283        # Parse Model
1284        parsed_data_model = data_model_parser.parse_model()
1285
1286        # Instantiate DataModelGraph
1287        data_model_grapher = DataModelGraph(parsed_data_model)
1288
1289        # Generate graph
1290        graph_data_model = data_model_grapher.generate_data_model_graph()
1291
1292        # Instantiate DataModelGraphExplorer
1293        dmge = DataModelGraphExplorer(graph_data_model)
1294
1295        manifests = []
1296        manifest_loaded = []
1297        datasets = self.getStorageDatasetsInProject(projectId)
1298        for datasetId, datasetName in datasets:
1299            # encode information about the manifest in a simple list (so that R clients can unpack it)
1300            # eventually can serialize differently
1301
1302            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1303            manifests.append(manifest)
1304
1305            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1306
1307            if manifest_info:
1308                manifest_id = manifest_info["properties"]["id"]
1309                manifest_name = manifest_info["properties"]["name"]
1310                manifest_path = manifest_info["path"]
1311                manifest = (
1312                    (datasetId, datasetName),
1313                    (manifest_id, manifest_name),
1314                    ("", ""),
1315                )
1316                if not dry_run:
1317                    self.associateMetadataWithFiles(
1318                        dmge, manifest_path, datasetId, manifest_record_type="table"
1319                    )
1320                manifest_loaded.append(manifest)
1321
1322        return manifests, manifest_loaded
Purpose:

For all manifests in a project, upload them as a table and add annotations from the manifest CSV. Assumes the manifest is already present as a CSV in a dataset in the project.

def move_entities_to_new_project( self, projectId: str, newProjectId: str, returnEntities: bool = False, dry_run: bool = False):
1324    def move_entities_to_new_project(
1325        self,
1326        projectId: str,
1327        newProjectId: str,
1328        returnEntities: bool = False,
1329        dry_run: bool = False,
1330    ):
1331        """
1332        For each manifest csv in a project, look for all the entity ids that are associated.
1333        Look up the entity in the files, move the entity to the new project.
1334        """
1335
1336        manifests = []
1337        manifest_loaded = []
1338        datasets = self.getStorageDatasetsInProject(projectId)
1339        if datasets:
1340            for datasetId, datasetName in datasets:
1341                # encode information about the manifest in a simple list (so that R clients can unpack it)
1342                # eventually can serialize differently
1343
1344                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1345                manifests.append(manifest)
1346
1347                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1348                if manifest_info:
1349                    manifest_id = manifest_info["properties"]["id"]
1350                    manifest_name = manifest_info["properties"]["name"]
1351                    manifest_path = manifest_info["path"]
1352                    manifest_df = load_df(manifest_path)
1353
1354                    manifest = (
1355                        (datasetId, datasetName),
1356                        (manifest_id, manifest_name),
1357                        ("", ""),
1358                    )
1359                    manifest_loaded.append(manifest)
1360
1361                    annotation_entities = self.storageFileviewTable[
1362                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1363                        & (self.storageFileviewTable["type"] == "folder")
1364                    ]["id"]
1365
1366                    if returnEntities:
1367                        for entityId in annotation_entities:
1368                            if not dry_run:
1369                                moved_entity = self.syn.move(entityId, datasetId)
1370                                self.synapse_entity_tracker.add(
1371                                    synapse_id=moved_entity.id, entity=moved_entity
1372                                )
1373                            else:
1374                                logging.info(
1375                                    f"{entityId} will be moved to folder {datasetId}."
1376                                )
1377                    else:
1378                        # generate project folder
1379                        archive_project_folder = Folder(
1380                            projectId + "_archive", parent=newProjectId
1381                        )
1382                        archive_project_folder = self.syn.store(archive_project_folder)
1383                        self.synapse_entity_tracker.add(
1384                            synapse_id=archive_project_folder.id,
1385                            entity=archive_project_folder,
1386                        )
1387
1388                        # generate dataset folder
1389                        dataset_archive_folder = Folder(
1390                            "_".join([datasetId, datasetName, "archive"]),
1391                            parent=archive_project_folder.id,
1392                        )
1393                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1394                        self.synapse_entity_tracker.add(
1395                            synapse_id=dataset_archive_folder.id,
1396                            entity=dataset_archive_folder,
1397                        )
1398
1399                        for entityId in annotation_entities:
1400                            # move entities to folder
1401                            if not dry_run:
1402                                moved_entity = self.syn.move(
1403                                    entityId, dataset_archive_folder.id
1404                                )
1405                                self.synapse_entity_tracker.add(
1406                                    synapse_id=moved_entity.id, entity=moved_entity
1407                                )
1408                            else:
1409                                logging.info(
1410                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1411                                )
1412        else:
1413            raise LookupError(
1414                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1415            )
1416        return manifests, manifest_loaded

For each manifest csv in a project, look for all the entity ids that are associated. Look up the entity in the files, move the entity to the new project.
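
A minimal sketch of a dry run (project IDs are placeholders):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup

# dry_run=True only logs which entities would be moved into the archive folders
manifests, manifest_loaded = store.move_entities_to_new_project(
    projectId="syn11111111",
    newProjectId="syn22222222",
    returnEntities=False,
    dry_run=True,
)
```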

@tracer.start_as_current_span('SynapseStorage::get_synapse_table')
def get_synapse_table( self, synapse_id: str) -> Tuple[pandas.core.frame.DataFrame, synapseclient.table.CsvFileTable]:
1418    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1419    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1420        """Download a Synapse table as a pandas dataframe; also return the table query results (schema and etags)
1421
1422        Args:
1423            synapse_id: synapse ID of the table to query
1424        """
1425
1426        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1427        df = results.asDataFrame(
1428            rowIdAndVersionInIndex=False,
1429            na_values=STR_NA_VALUES_FILTERED,
1430            keep_default_na=False,
1431        )
1432
1433        return df, results

Download a Synapse table as a pandas dataframe; also return the table query results (schema and etags)

Arguments:
  • synapse_id: synapse ID of the table to query
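
A minimal usage sketch (the table ID is a placeholder):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup

df, results = store.get_synapse_table("syn33333333")
print(df.head())  # table contents as a pandas DataFrame
# `results` is the synapseclient CsvFileTable holding the schema/etag information
```
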
def uploadDB(*args, **kwargs):

Method to upload a database to an asset store. In Synapse, this will upload a metadata table.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest: pd.Df manifest to upload
  • datasetId: synID of the dataset for the manifest
  • table_name: name of the table to be uploaded
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
  • existingTableId: str of the synId of the existing table, if one already exists
  • table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:

manifest_table_id: synID of the uploaded table
manifest: the original manifest
table_manifest: manifest formatted appropriately for the table
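
A sketch of a direct call, continuing the earlier sketches (`store`, `dmge`, and a file-based `manifest_df` with an entityId column are assumed to exist; IDs and the table name are placeholders):

```python
# "replace" overwrites any existing table of the same name in the dataset's project
manifest_table_id, manifest, table_manifest = store.uploadDB(
    dmge=dmge,
    manifest=manifest_df,
    datasetId="syn12345678",
    table_name="example_synapse_storage_manifest_table",
    table_manipulation="replace",
    table_column_names="class_label",
)
```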

@tracer.start_as_current_span('SynapseStorage::formatDB')
def formatDB(self, dmge, manifest, table_column_names):
1484    @tracer.start_as_current_span("SynapseStorage::formatDB")
1485    def formatDB(self, dmge, manifest, table_column_names):
1486        """
1487        Method to format a manifest appropriately for upload as table
1488
1489        Args:
1490            dmge: DataModelGraphExplorer object
1491            manifest: pd.Df manifest to upload
1492            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1493                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1494                display label formatting.
1495        Returns:
1496            col_schema: schema for table columns: type, size, etc
1497            table_manifest: formatted manifest
1498
1499        """
1500        # Rename the manifest columns to display names to match fileview
1501
1502        blacklist_chars = ["(", ")", ".", " ", "-"]
1503        manifest_columns = manifest.columns.tolist()
1504
1505        table_manifest = deepcopy(manifest)
1506
1507        if table_column_names == "display_name":
1508            cols = table_manifest.columns
1509
1510        elif table_column_names == "display_label":
1511            cols = [
1512                str(col).translate({ord(x): "" for x in blacklist_chars})
1513                for col in manifest_columns
1514            ]
1515
1516        elif table_column_names == "class_label":
1517            cols = [
1518                get_class_label_from_display_name(str(col)).translate(
1519                    {ord(x): "" for x in blacklist_chars}
1520                )
1521                for col in manifest_columns
1522            ]
1523        else:
1524            raise ValueError(
1525                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1526            )
1527
1528        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1529
1530        # Reset column names in table manifest
1531        table_manifest.columns = cols
1532
1533        # move entity id to end of df
1534        entity_col = table_manifest.pop("entityId")
1535        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1536
1537        # Get the column schema
1538        col_schema = as_table_columns(table_manifest)
1539
1540        # Set Id column length to 64 (for some reason not being auto set.)
1541        for i, col in enumerate(col_schema):
1542            if col["name"].lower() == "id":
1543                col_schema[i]["maximumSize"] = 64
1544
1545        return col_schema, table_manifest

Method to format a manifest appropriately for upload as a table

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest: pd.Df manifest to upload
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:

col_schema: schema for table columns: type, size, etc table_manifest: formatted manifest
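
A minimal sketch, continuing the earlier sketches (`store`, `dmge`, and `manifest_df` are assumed):

```python
# Returns Synapse column definitions plus the manifest with table-safe column names
col_schema, table_manifest = store.formatDB(
    dmge, manifest_df, table_column_names="class_label"
)
print([col["name"] for col in col_schema])
```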

@tracer.start_as_current_span('SynapseStorage::buildDB')
def buildDB( self, datasetId: str, table_name: str, col_schema: List, table_manifest: pandas.core.frame.DataFrame, table_manipulation: str, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, restrict: bool = False):
1547    @tracer.start_as_current_span("SynapseStorage::buildDB")
1548    def buildDB(
1549        self,
1550        datasetId: str,
1551        table_name: str,
1552        col_schema: List,
1553        table_manifest: pd.DataFrame,
1554        table_manipulation: str,
1555        dmge: DataModelGraphExplorer,
1556        restrict: bool = False,
1557    ):
1558        """
1559        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1560        Calls TableOperations class to execute
1561
1562        Args:
1563            datasetId: synID of the dataset for the manifest
1564            table_name: name of the table to be uploaded
1565            col_schema: schema for table columns: type, size, etc from `formatDB`
1566            table_manifest: formatted manifest that can be uploaded as a table
1567            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1568            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1569
1570        Returns:
1571            manifest_table_id: synID of the uploaded table
1572
1573        """
1574        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1575        existing_table_id = self.syn.findEntityId(
1576            name=table_name, parent=table_parent_id
1577        )
1578
1579        tableOps = TableOperations(
1580            synStore=self,
1581            tableToLoad=table_manifest,
1582            tableName=table_name,
1583            datasetId=datasetId,
1584            existingTableId=existing_table_id,
1585            restrict=restrict,
1586            synapse_entity_tracker=self.synapse_entity_tracker,
1587        )
1588
1589        if not table_manipulation or existing_table_id is None:
1590            manifest_table_id = tableOps.createTable(
1591                columnTypeDict=col_schema,
1592                specifySchema=True,
1593            )
1594        elif existing_table_id is not None:
1595            if table_manipulation.lower() == "replace":
1596                manifest_table_id = tableOps.replaceTable(
1597                    specifySchema=True,
1598                    columnTypeDict=col_schema,
1599                )
1600            elif table_manipulation.lower() == "upsert":
1601                manifest_table_id = tableOps.upsertTable(
1602                    dmge=dmge,
1603                )
1604            elif table_manipulation.lower() == "update":
1605                manifest_table_id = tableOps.updateTable()
1606
1607        if table_manipulation and table_manipulation.lower() == "upsert":
1608            table_entity = self.synapse_entity_tracker.get(
1609                synapse_id=existing_table_id or manifest_table_id,
1610                syn=self.syn,
1611                download_file=False,
1612            )
1613            annos = OldAnnotations(
1614                id=table_entity.id,
1615                etag=table_entity.etag,
1616                values=table_entity.annotations,
1617            )
1618            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1619            annos = self.syn.set_annotations(annos)
1620            table_entity.etag = annos.etag
1621            table_entity.annotations = annos
1622
1623        return manifest_table_id

Method to construct the table appropriately: create new table, replace existing, or upsert new into existing. Calls TableOperations class to execute.

Arguments:
  • datasetId: synID of the dataset for the manifest
  • table_name: name of the table to be uploaded
  • col_schema: schema for table columns: type, size, etc from formatDB
  • table_manifest: formatted manifest that can be uploaded as a table
  • table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
Returns:

manifest_table_id: synID of the uploaded table
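
A minimal sketch, continuing the `formatDB` sketch above (IDs and the table name are placeholders):

```python
# Create or replace the manifest table using the schema produced by formatDB
manifest_table_id = store.buildDB(
    datasetId="syn12345678",
    table_name="example_synapse_storage_manifest_table",
    col_schema=col_schema,
    table_manifest=table_manifest,
    table_manipulation="replace",
    dmge=dmge,
    restrict=False,
)
```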

@tracer.start_as_current_span('SynapseStorage::upload_manifest_file')
def upload_manifest_file( self, manifest, metadataManifestPath, datasetId, restrict_manifest, component_name=''):
1625    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1626    def upload_manifest_file(
1627        self,
1628        manifest,
1629        metadataManifestPath,
1630        datasetId,
1631        restrict_manifest,
1632        component_name="",
1633    ):
1634        # Update manifest to have the new entityId column
1635        manifest.to_csv(metadataManifestPath, index=False)
1636
1637        # store manifest to Synapse as a CSV
1638        # update file name
1639        file_name_full = metadataManifestPath.split("/")[-1]
1640        file_extension = file_name_full.split(".")[-1]
1641
1642        # Differentiate "censored" and "uncensored" manifest
1643        if "censored" in file_name_full:
1644            file_name_new = (
1645                os.path.basename(CONFIG.synapse_manifest_basename)
1646                + "_"
1647                + component_name
1648                + "_censored"
1649                + "."
1650                + file_extension
1651            )
1652        else:
1653            file_name_new = (
1654                os.path.basename(CONFIG.synapse_manifest_basename)
1655                + "_"
1656                + component_name
1657                + "."
1658                + file_extension
1659            )
1660
1661        manifest_synapse_file = None
1662        try:
1663            # Rename the file to file_name_new then revert
1664            # This is to maintain the original file name in case other code is
1665            # expecting that the file exists with the original name
1666            original_file_path = metadataManifestPath
1667            new_file_path = os.path.join(
1668                os.path.dirname(metadataManifestPath), file_name_new
1669            )
1670            os.rename(original_file_path, new_file_path)
1671
1672            manifest_synapse_file = self._store_file_for_manifest_upload(
1673                new_file_path=new_file_path,
1674                dataset_id=datasetId,
1675                existing_file_name=file_name_full,
1676                file_name_new=file_name_new,
1677                restrict_manifest=restrict_manifest,
1678            )
1679            manifest_synapse_file_id = manifest_synapse_file.id
1680
1681        finally:
1682            # Revert the file name back to the original
1683            os.rename(new_file_path, original_file_path)
1684
1685            if manifest_synapse_file:
1686                manifest_synapse_file.path = original_file_path
1687
1688        return manifest_synapse_file_id
async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1745    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1746        """get annotations asynchronously
1747
1748        Args:
1749            synapse_id (str): synapse id of the entity that the annotation belongs
1750
1751        Returns:
1752            Dict[str, Any]: The requested entity bundle matching
1753            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1754        """
1755        return await get_entity_id_bundle2(
1756            entity_id=synapse_id,
1757            request={"includeAnnotations": True},
1758            synapse_client=self.syn,
1759        )

get annotations asynchronously

Arguments:
  • synapse_id (str): synapse id of the entity that the annotation belongs
Returns:

Dict[str, Any]: The requested entity bundle matching https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html

async def store_async_annotation( self, annotation_dict: dict) -> synapseclient.models.annotations.Annotations:
1761    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1762        """store annotation in an async way
1763
1764        Args:
1765            annotation_dict (dict): annotation in a dictionary format
1766
1767        Returns:
1768            Annotations: The stored annotations.
1769        """
1770        annotation_data = Annotations.from_dict(
1771            synapse_annotations=annotation_dict["annotations"]["annotations"]
1772        )
1773        annotation_class = Annotations(
1774            annotations=annotation_data,
1775            etag=annotation_dict["annotations"]["etag"],
1776            id=annotation_dict["annotations"]["id"],
1777        )
1778        annotation_storage_result = await annotation_class.store_async(
1779            synapse_client=self.syn
1780        )
1781        local_entity = self.synapse_entity_tracker.get(
1782            synapse_id=annotation_dict["annotations"]["id"],
1783            syn=self.syn,
1784            download_file=False,
1785            retrieve_if_not_present=False,
1786        )
1787        if local_entity:
1788            local_entity.etag = annotation_storage_result.etag
1789            local_entity.annotations = annotation_storage_result
1790        return annotation_storage_result

store annotation in an async way

Arguments:
  • annotation_dict (dict): annotation in a dictionary format
Returns:

Annotations: The stored annotations.
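
A minimal async sketch combining `get_async_annotation` and `store_async_annotation` (the Synapse ID is a placeholder, and the bundle shape is assumed to match what `store_async_annotation` expects, i.e. a dict with an "annotations" entry carrying id, etag, and annotations):

```python
import asyncio

from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # hypothetical setup


async def touch_annotations(synapse_id: str):
    # Fetch the entity bundle (includes annotations, etag, and id) ...
    bundle = await store.get_async_annotation(synapse_id)
    # ... and store it back unchanged
    return await store.store_async_annotation(annotation_dict=bundle)


result = asyncio.run(touch_annotations("syn12345678"))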

def process_row_annotations( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, metadata_syn: Dict[str, Any], hide_blanks: bool, csv_list_regex: str, annos: Dict[str, Any], annotation_keys: str) -> Dict[str, Any]:
1792    def process_row_annotations(
1793        self,
1794        dmge: DataModelGraphExplorer,
1795        metadata_syn: Dict[str, Any],
1796        hide_blanks: bool,
1797        csv_list_regex: str,
1798        annos: Dict[str, Any],
1799        annotation_keys: str,
1800    ) -> Dict[str, Any]:
1801        """Processes metadata annotations based on the logic below:
1802        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1803            An empty or whitespace-only string.
1804            A NaN value (if the annotation is a float).
1805        If any of the above conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
1806        If any of the above conditions are met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1807
1808        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1809        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1810
1811        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1812
1813        4. Returns the updated annotations dictionary.
1814
1815        Args:
1816            dmge (DataModelGraphExplorer): data model graph explorer
1817            metadata_syn (dict): metadata used for Synapse storage
1818            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1819            csv_list_regex (str): Regex to match with comma separated list
1820            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1821            annotation_keys (str): display_label/class_label
1822
1823        Returns:
1824            Dict[str, Any]: annotations as a dictionary
1825
1826        ```mermaid
1827        flowchart TD
1828            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1829            C -- Yes --> D{Is hide_blanks True?}
1830            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1831            D -- No --> F[Assign empty string to annotation key]
1832            C -- No --> G{Is anno_v a string?}
1833            G -- No --> H[Assign original value of anno_v to annotation key]
1834            G -- Yes --> I{Does anno_v match csv_list_regex?}
1835            I -- Yes --> J[Get validation rule of anno_k]
1836            J --> K{Does the validation rule contain 'list'}
1837            K -- Yes --> L[Split anno_v by commas and assign as list]
1838            I -- No --> H
1839            K -- No --> H
1840        ```
1841        """
1842        for anno_k, anno_v in metadata_syn.items():
1843            # Remove keys with nan or empty string values or string that only contains white space from dict of annotations to be uploaded
1844            # if present on current data annotation
1845            if hide_blanks and (
1846                (isinstance(anno_v, str) and anno_v.strip() == "")
1847                or (isinstance(anno_v, float) and np.isnan(anno_v))
1848            ):
1849                if anno_k in annos["annotations"]["annotations"]:
1850                    annos["annotations"]["annotations"].pop(anno_k)
1851                continue
1853
1854            # Otherwise save annotation as appropriate
1855            if isinstance(anno_v, float) and np.isnan(anno_v):
1856                annos["annotations"]["annotations"][anno_k] = ""
1857                continue
1858
1859            # Handle strings that match the csv_list_regex and pass the validation rule
1860            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1861                # Use a dictionary to dynamically choose the argument
1862                param = (
1863                    {"node_display_name": anno_k}
1864                    if annotation_keys == "display_label"
1865                    else {"node_label": anno_k}
1866                )
1867                node_validation_rules = dmge.get_node_validation_rules(**param)
1868
1869                if rule_in_rule_list("list", node_validation_rules):
1870                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1871                    continue
1872            # default: assign the original value
1873            annos["annotations"]["annotations"][anno_k] = anno_v
1874
1875        return annos

Processes metadata annotations based on the logic below:

  1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is an empty or whitespace-only string, or a NaN value (if the annotation is a float). If any of these conditions are met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped; if hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
  2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name". If the rule contains "list", split the string by commas and assign the resulting list as the annotation value for that key.

  3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

  4. Returns the updated annotations dictionary.

Arguments:
  • dmge (DataModelGraphExplorer): data model graph explorer
  • metadata_syn (dict): metadata used for Synapse storage
  • hide_blanks (bool): if true, does not upload annotation keys with blank values.
  • csv_list_regex (str): Regex to match with comma separated list
  • annos (Dict[str, Any]): dictionary of annotation returned from synapse
  • annotation_keys (str): display_label/class_label
Returns:

Dict[str, Any]: annotations as a dictionary

```mermaid
flowchart TD
    A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
    C -- Yes --> D{Is hide_blanks True?}
    D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
    D -- No --> F[Assign empty string to annotation key]
    C -- No --> G{Is anno_v a string?}
    G -- No --> H[Assign original value of anno_v to annotation key]
    G -- Yes --> I{Does anno_v match csv_list_regex?}
    I -- Yes --> J[Get validation rule of anno_k]
    J --> K{Does the validation rule contain 'list'}
    K -- Yes --> L[Split anno_v by commas and assign as list]
    I -- No --> H
    K -- No --> H
```
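
A minimal sketch of a call, continuing the earlier sketches (`store` and `dmge` are assumed; `row_metadata` and `entity_bundle` below are hypothetical stand-ins for one manifest row's annotations and a fetched entity bundle, and `comma_separated_list_regex` is the helper imported at the top of this module):

```python
from schematic.utils.validate_utils import comma_separated_list_regex

row_metadata = {"SampleType": "blood, saliva"}  # one row's {key: value} annotations
entity_bundle = {"annotations": {"id": "syn12345678", "etag": "0", "annotations": {}}}

annos = store.process_row_annotations(
    dmge=dmge,
    metadata_syn=row_metadata,
    hide_blanks=True,
    csv_list_regex=comma_separated_list_regex(),
    annos=entity_bundle,
    annotation_keys="class_label",
)
```
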
async def format_row_annotations(*args: Any, **kwargs: Any) -> Any:
def format_manifest_annotations(*args, **kwargs):

Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. For now just getting the Component.

@tracer.start_as_current_span('SynapseStorage::add_annotations_to_entities_files')
async def add_annotations_to_entities_files( self, dmge, manifest, manifest_record_type: str, datasetId: str, hideBlanks: bool, manifest_synapse_table_id='', annotation_keys: str = 'class_label'):
2237    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2238    async def add_annotations_to_entities_files(
2239        self,
2240        dmge,
2241        manifest,
2242        manifest_record_type: str,
2243        datasetId: str,
2244        hideBlanks: bool,
2245        manifest_synapse_table_id="",
2246        annotation_keys: str = "class_label",
2247    ):
2248        """
2249        Depending on upload type, add IDs to the entityId column. Add annotations to connected
2250        files and folders. Despite the name of this function, it also applies to folders.
2251
2252        Args:
2253            dmge: DataModelGraphExplorer Object
2254            manifest (pd.DataFrame): loaded df containing user supplied data.
2255            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2256            datasetId (str): synapse ID of folder containing the dataset
2257            hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2258            manifest_synapse_table_id (str): Default is an empty string ''.
2259            annotation_keys: (str) display_label/class_label(default), Determines labeling style for annotation keys. class_label will format the display
2260                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2261                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2262        Returns:
2263            manifest (pd.DataFrame): modified to add entityId as appropriate
2264
2265        """
2266
2267        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2268        if "filename" in [col.lower() for col in manifest.columns]:
2269            # get current list of files and store as dataframe
2270            dataset_files = self.getFilesInStorageDataset(datasetId)
2271            files_and_entityIds = self._get_file_entityIds(
2272                dataset_files=dataset_files, only_new_files=False
2273            )
2274            file_df = pd.DataFrame(files_and_entityIds)
2275
2276            # Merge dataframes to add entityIds
2277            manifest = manifest.merge(
2278                file_df, how="left", on="Filename", suffixes=["_x", None]
2279            ).drop("entityId_x", axis=1)
2280
2281        # Fill `entityId` for each row if missing and annotate entity as appropriate
2282        requests = set()
2283        for idx, row in manifest.iterrows():
2284            if not row["entityId"] and (
2285                manifest_record_type == "file_and_entities"
2286                or manifest_record_type == "table_file_and_entities"
2287            ):
2288                manifest, entityId = self._create_entity_id(
2289                    idx, row, manifest, datasetId
2290                )
2291            elif not row["entityId"] and manifest_record_type == "table_and_file":
2292                # If not using entityIds, fill with manifest_table_id so the column is populated (such rows are skipped by the annotation step below)
2293                row["entityId"] = manifest_synapse_table_id
2294                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2295                entityId = ""
2296                # If the row is the manifest table, do not add annotations
2297            elif row["entityId"] == manifest_synapse_table_id:
2298                entityId = ""
2299            else:
2300                # get the file id of the file to annotate, collected in above step.
2301                entityId = row["entityId"]
2302
2303            # Adding annotations to connected files.
2304            if entityId:
2305                # Format annotations for Synapse
2306                annos_task = asyncio.create_task(
2307                    self.format_row_annotations(
2308                        dmge, row, entityId, hideBlanks, annotation_keys
2309                    )
2310                )
2311                requests.add(annos_task)
2312        await self._process_store_annos(requests)
2313        return manifest

Depending on upload type, add IDs to the entityId column. Add annotations to connected files and folders. Despite the name of this function, it also applies to folders.

Arguments:
  • dmge: DataModelGraphExplorer Object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, create a Synapse table to house the entire manifest, or combinations of these.
  • datasetId (str): synapse ID of folder containing the dataset
  • hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
  • manifest_synapse_table_id (str): Default is an empty string ''.
  • annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:

manifest (pd.DataFrame): modified to add entityId as appropriate
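
The Filename merge at the top of this method is worth seeing in isolation. Below is a self-contained toy reproduction of that pandas step (data is invented): entityIds retrieved from Synapse replace the manifest's stale entityId column.

import pandas as pd

manifest = pd.DataFrame({"Filename": ["a.txt", "b.txt"], "entityId": ["", ""]})
file_df = pd.DataFrame(
    {"Filename": ["a.txt", "b.txt"], "entityId": ["syn111", "syn222"]}
)

# suffixes=["_x", None] renames only the left-hand (manifest) column to
# "entityId_x", so the Synapse-side column keeps the name "entityId" and the
# stale manifest column can be dropped.
manifest = manifest.merge(
    file_df, how="left", on="Filename", suffixes=["_x", None]
).drop("entityId_x", axis=1)
print(manifest)
#   Filename entityId
# 0    a.txt   syn111
# 1    b.txt   syn222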

@tracer.start_as_current_span('SynapseStorage::upload_manifest_as_table')
def upload_manifest_as_table( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, manifest: pandas.core.frame.DataFrame, metadataManifestPath: str, datasetId: str, table_name: str, component_name: str, restrict: bool, manifest_record_type: str, hideBlanks: bool, table_manipulation: str, table_column_names: str, annotation_keys: str, file_annotations_upload: bool = True):
2315    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2316    def upload_manifest_as_table(
2317        self,
2318        dmge: DataModelGraphExplorer,
2319        manifest: pd.DataFrame,
2320        metadataManifestPath: str,
2321        datasetId: str,
2322        table_name: str,
2323        component_name: str,
2324        restrict: bool,
2325        manifest_record_type: str,
2326        hideBlanks: bool,
2327        table_manipulation: str,
2328        table_column_names: str,
2329        annotation_keys: str,
2330        file_annotations_upload: bool = True,
2331    ):
2332        """Upload manifest to Synapse as a table and csv.
2333        Args:
2334            dmge: DataModelGraphExplorer object
2335            manifest (pd.DataFrame): loaded df containing user supplied data.
2336            metadataManifestPath: path to csv containing a validated metadata manifest.
2337            datasetId (str): synapse ID of folder containing the dataset
2338            table_name (str): Generated to name the table being uploaded.
2339            component_name (str): Name of the component manifest that is currently being uploaded.
2340            restrict (bool): Flag for censored data.
2341            manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, create a Synapse table to house the entire manifest, or combinations of these.
2342            hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
2343            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2344            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2345                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2346                display label formatting.
2347            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2348                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2349                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2350            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2351        Return:
2352            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2353        """
2354        # Upload manifest as a table, get the ID and updated manifest.
2355        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2356            dmge=dmge,
2357            manifest=manifest,
2358            datasetId=datasetId,
2359            table_name=table_name,
2360            restrict=restrict,
2361            table_manipulation=table_manipulation,
2362            table_column_names=table_column_names,
2363        )
2364
2365        if file_annotations_upload:
2366            manifest = asyncio.run(
2367                self.add_annotations_to_entities_files(
2368                    dmge,
2369                    manifest,
2370                    manifest_record_type,
2371                    datasetId,
2372                    hideBlanks,
2373                    manifest_synapse_table_id,
2374                    annotation_keys,
2375                )
2376            )
2377        # Load manifest to synapse as a CSV File
2378        manifest_synapse_file_id = self.upload_manifest_file(
2379            manifest=manifest,
2380            metadataManifestPath=metadataManifestPath,
2381            datasetId=datasetId,
2382            restrict_manifest=restrict,
2383            component_name=component_name,
2384        )
2385
2386        # Set annotations for the file manifest.
2387        manifest_annotations = self.format_manifest_annotations(
2388            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2389        )
2390        annos = self.syn.set_annotations(annotations=manifest_annotations)
2391        manifest_entity = self.synapse_entity_tracker.get(
2392            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2393        )
2394        manifest_entity.annotations = annos
2395        manifest_entity.etag = annos.etag
2396
2397        logger.info("Associated manifest file with dataset on Synapse.")
2398
2399        # Update manifest Synapse table with new entity id column.
2400        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2401            dmge=dmge,
2402            manifest=manifest,
2403            datasetId=datasetId,
2404            table_name=table_name,
2405            restrict=restrict,
2406            table_manipulation="update",
2407            table_column_names=table_column_names,
2408        )
2409
2410        # Set annotations for the table manifest
2411        manifest_annotations = self.format_manifest_annotations(
2412            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2413        )
2414        annotations_manifest_table = self.syn.set_annotations(
2415            annotations=manifest_annotations
2416        )
2417        manifest_table_entity = self.synapse_entity_tracker.get(
2418            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2419        )
2420        manifest_table_entity.annotations = annotations_manifest_table
2421        manifest_table_entity.etag = annotations_manifest_table.etag
2422
2423        return manifest_synapse_file_id

Upload manifest to Synapse as a table and csv.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • table_name (str): Generated to name the table being uploaded.
  • component_name (str): Name of the component manifest that is currently being uploaded.
  • restrict (bool): Flag for censored data.
  • manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, create a Synapse table to house the entire manifest, or combinations of these.
  • hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
  • table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
Return:

manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
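
A hedged usage sketch of this table-and-csv path follows. The store, dmge, and manifest arguments are assumed to be an authenticated SynapseStorage, a DataModelGraphExplorer, and a validated manifest DataFrame; all IDs, paths, and names are placeholders.

from schematic.schemas.data_model_graph import DataModelGraphExplorer
from schematic.store.synapse import SynapseStorage
import pandas as pd

def upload_patient_manifest_as_table(
    store: SynapseStorage, dmge: DataModelGraphExplorer, manifest: pd.DataFrame
) -> str:
    # Placeholder IDs and paths; table_manipulation may also be "upsert".
    return store.upload_manifest_as_table(
        dmge=dmge,
        manifest=manifest,
        metadataManifestPath="synapse_storage_manifest_patient.csv",
        datasetId="syn12345678",
        table_name="patient_synapse_storage_manifest_table",
        component_name="Patient",
        restrict=False,
        manifest_record_type="table_and_file",
        hideBlanks=True,
        table_manipulation="replace",
        table_column_names="class_label",
        annotation_keys="class_label",
        file_annotations_upload=True,
    )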

@tracer.start_as_current_span('SynapseStorage::upload_manifest_as_csv')
def upload_manifest_as_csv( self, dmge, manifest, metadataManifestPath, datasetId, restrict, manifest_record_type, hideBlanks, component_name, annotation_keys: str, file_annotations_upload: bool = True):
2425    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2426    def upload_manifest_as_csv(
2427        self,
2428        dmge,
2429        manifest,
2430        metadataManifestPath,
2431        datasetId,
2432        restrict,
2433        manifest_record_type,
2434        hideBlanks,
2435        component_name,
2436        annotation_keys: str,
2437        file_annotations_upload: bool = True,
2438    ):
2439        """Upload manifest to Synapse as a csv only.
2440        Args:
2441            dmge: DataModelGraphExplorer object
2442            manifest (pd.DataFrame): loaded df containing user supplied data.
2443            metadataManifestPath: path to csv containing a validated metadata manifest.
2444            datasetId (str): synapse ID of folder containing the dataset
2445            restrict (bool): Flag for censored data.
2446            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, create a Synapse table to house the entire manifest, or combinations of these.
2447            hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
2448            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2449                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2450                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2451            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2452        Return:
2453            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2454        """
2455        if file_annotations_upload:
2456            manifest = asyncio.run(
2457                self.add_annotations_to_entities_files(
2458                    dmge,
2459                    manifest,
2460                    manifest_record_type,
2461                    datasetId,
2462                    hideBlanks,
2463                    annotation_keys=annotation_keys,
2464                )
2465            )
2466
2467        # Load manifest to synapse as a CSV File
2468        manifest_synapse_file_id = self.upload_manifest_file(
2469            manifest,
2470            metadataManifestPath,
2471            datasetId,
2472            restrict,
2473            component_name=component_name,
2474        )
2475
2476        # Set annotations for the file manifest.
2477        manifest_annotations = self.format_manifest_annotations(
2478            manifest, manifest_synapse_file_id
2479        )
2480        annos = self.syn.set_annotations(manifest_annotations)
2481        manifest_entity = self.synapse_entity_tracker.get(
2482            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2483        )
2484        manifest_entity.annotations = annos
2485        manifest_entity.etag = annos.etag
2486
2487        logger.info("Associated manifest file with dataset on Synapse.")
2488
2489        return manifest_synapse_file_id

Upload manifest to Synapse as a csv only.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • restrict (bool): Flag for censored data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, create a Synapse table to house the entire manifest, or combinations of these.
  • hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
Return:

manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
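
The csv-only path still runs the same asynchronous annotation step via asyncio.run. A minimal, self-contained sketch of that task pattern (one task per row, collected into a set and awaited together) is shown below; store_annotations is a hypothetical stand-in for formatting and storing one row's annotations, and asyncio.gather stands in for _process_store_annos.

import asyncio

async def store_annotations(entity_id: str) -> str:
    await asyncio.sleep(0)  # stand-in for the Synapse HTTP call
    return f"stored {entity_id}"

async def main() -> None:
    # One task per row, collected in a set and awaited together, as in
    # add_annotations_to_entities_files above.
    requests = set()
    for entity_id in ["syn1", "syn2", "syn3"]:
        requests.add(asyncio.create_task(store_annotations(entity_id)))
    results = await asyncio.gather(*requests)
    print(sorted(results))

asyncio.run(main())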

@tracer.start_as_current_span('SynapseStorage::upload_manifest_combo')
def upload_manifest_combo( self, dmge, manifest, metadataManifestPath, datasetId, table_name, component_name, restrict, manifest_record_type, hideBlanks, table_manipulation, table_column_names: str, annotation_keys: str, file_annotations_upload: bool = True):
2491    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2492    def upload_manifest_combo(
2493        self,
2494        dmge,
2495        manifest,
2496        metadataManifestPath,
2497        datasetId,
2498        table_name,
2499        component_name,
2500        restrict,
2501        manifest_record_type,
2502        hideBlanks,
2503        table_manipulation,
2504        table_column_names: str,
2505        annotation_keys: str,
2506        file_annotations_upload: bool = True,
2507    ):
2508        """Upload manifest to Synapse as a table and CSV with entities.
2509        Args:
2510            dmge: DataModelGraphExplorer object
2511            manifest (pd.DataFrame): loaded df containing user supplied data.
2512            metadataManifestPath: path to csv containing a validated metadata manifest.
2513            datasetId (str): synapse ID of folder containing the dataset
2514            table_name (str): Generated to name the table being uploaded.
2515            component_name (str): Name of the component manifest that is currently being uploaded.
2516            restrict (bool): Flag for censored data.
2517            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, create a Synapse table to house the entire manifest, or combinations of these.
2518            hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
2519            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2520            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2521                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2522                display label formatting.
2523            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2524                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2525                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2526            file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
2527        Return:
2528            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2529        """
2530        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2531            dmge=dmge,
2532            manifest=manifest,
2533            datasetId=datasetId,
2534            table_name=table_name,
2535            restrict=restrict,
2536            table_manipulation=table_manipulation,
2537            table_column_names=table_column_names,
2538        )
2539
2540        if file_annotations_upload:
2541            manifest = asyncio.run(
2542                self.add_annotations_to_entities_files(
2543                    dmge,
2544                    manifest,
2545                    manifest_record_type,
2546                    datasetId,
2547                    hideBlanks,
2548                    manifest_synapse_table_id,
2549                    annotation_keys=annotation_keys,
2550                )
2551            )
2552
2553        # Load manifest to synapse as a CSV File
2554        manifest_synapse_file_id = self.upload_manifest_file(
2555            manifest, metadataManifestPath, datasetId, restrict, component_name
2556        )
2557
2558        # Set annotations for the file manifest.
2559        manifest_annotations = self.format_manifest_annotations(
2560            manifest, manifest_synapse_file_id
2561        )
2562        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2563        manifest_entity = self.synapse_entity_tracker.get(
2564            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2565        )
2566        manifest_entity.annotations = file_manifest_annotations
2567        manifest_entity.etag = file_manifest_annotations.etag
2568        logger.info("Associated manifest file with dataset on Synapse.")
2569
2570        # Update manifest Synapse table with new entity id column.
2571        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2572            dmge=dmge,
2573            manifest=manifest,
2574            datasetId=datasetId,
2575            table_name=table_name,
2576            restrict=restrict,
2577            table_manipulation="update",
2578            table_column_names=table_column_names,
2579        )
2580
2581        # Set annotations for the table manifest
2582        manifest_annotations = self.format_manifest_annotations(
2583            manifest, manifest_synapse_table_id
2584        )
2585        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2586        manifest_entity = self.synapse_entity_tracker.get(
2587            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2588        )
2589        manifest_entity.annotations = table_manifest_annotations
2590        manifest_entity.etag = table_manifest_annotations.etag
2591        return manifest_synapse_file_id

Upload manifest to Synapse as a table and CSV with entities.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • table_name (str): Generated to name the table being uploaded.
  • component_name (str): Name of the component manifest that is currently being uploaded.
  • restrict (bool): Flag for censored data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, create a Synapse table to house the entire manifest, or combinations of these.
  • hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
  • table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Defaults to True. If False, do not add annotations to files.
Return:

manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
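
Note the two uploadDB passes in this method: the first applies the caller's table_manipulation, and the second, with "update", pushes the entityId column that was filled in by the annotation step between them. A runnable toy sketch of that sequence follows, with a stub standing in for SynapseStorage.uploadDB.

import pandas as pd

def uploadDB(manifest: pd.DataFrame, table_manipulation: str) -> str:
    # Stub for SynapseStorage.uploadDB; just reports which pass ran.
    print(f"uploadDB called with table_manipulation={table_manipulation!r}")
    return "syn_table_id"

manifest = pd.DataFrame({"Sample": ["s1"], "entityId": [""]})
table_id = uploadDB(manifest, table_manipulation="replace")  # first pass
manifest.loc[0, "entityId"] = "syn999"  # filled in by the annotation step
table_id = uploadDB(manifest, table_manipulation="update")   # second pass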

@tracer.start_as_current_span('SynapseStorage::associateMetadataWithFiles')
def associateMetadataWithFiles( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, metadataManifestPath: str, datasetId: str, manifest_record_type: str = 'table_file_and_entities', hideBlanks: bool = False, restrict_manifest=False, table_manipulation: str = 'replace', table_column_names: str = 'class_label', annotation_keys: str = 'class_label', file_annotations_upload: bool = True) -> str:
2593    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2594    def associateMetadataWithFiles(
2595        self,
2596        dmge: DataModelGraphExplorer,
2597        metadataManifestPath: str,
2598        datasetId: str,
2599        manifest_record_type: str = "table_file_and_entities",
2600        hideBlanks: bool = False,
2601        restrict_manifest=False,
2602        table_manipulation: str = "replace",
2603        table_column_names: str = "class_label",
2604        annotation_keys: str = "class_label",
2605        file_annotations_upload: bool = True,
2606    ) -> str:
2607        """Associate metadata with files in a storage dataset already on Synapse.
2608        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2609
2610        If this is a new manifest there could be no Synapse entities associated with the rows of this manifest
2611        this may be due to data type (e.g. clinical data) being tabular
2612        and not requiring files; to utilize uniform interfaces downstream
2613        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2614        and an entity column is added to the manifest containing the resulting
2615        entity IDs; a table is also created at present as an additional interface
2616        for downstream query and interaction with the data.
2617
2618        Args:
2619            dmge: DataModelGraphExplorer Object
2620            metadataManifestPath: path to csv containing a validated metadata manifest.
2621            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2622            Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item.
2623            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the row's column data as metadata/annotations to this file.
2624            datasetId: synapse ID of folder containing the dataset
2625            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and table options in combination.
2626            hideBlanks: Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
2627            restrict_manifest (bool): Default is False. Flag for censored data.
2628            table_manipulation (str): Default is 'replace'. Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2629            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2630                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2631                display label formatting.
2632            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2633                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2634                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2635        Returns:
2636            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2637        """
2638        # Read new manifest CSV:
2639        manifest = self._read_manifest(metadataManifestPath)
2640        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2641
2642        table_name, component_name = self._generate_table_name(manifest)
2643
2644        # Upload manifest to synapse based on user input (manifest_record_type)
2645        if manifest_record_type == "file_only":
2646            manifest_synapse_file_id = self.upload_manifest_as_csv(
2647                dmge=dmge,
2648                manifest=manifest,
2649                metadataManifestPath=metadataManifestPath,
2650                datasetId=datasetId,
2651                restrict=restrict_manifest,
2652                hideBlanks=hideBlanks,
2653                manifest_record_type=manifest_record_type,
2654                component_name=component_name,
2655                annotation_keys=annotation_keys,
2656                file_annotations_upload=file_annotations_upload,
2657            )
2658        elif manifest_record_type == "table_and_file":
2659            manifest_synapse_file_id = self.upload_manifest_as_table(
2660                dmge=dmge,
2661                manifest=manifest,
2662                metadataManifestPath=metadataManifestPath,
2663                datasetId=datasetId,
2664                table_name=table_name,
2665                component_name=component_name,
2666                restrict=restrict_manifest,
2667                hideBlanks=hideBlanks,
2668                manifest_record_type=manifest_record_type,
2669                table_manipulation=table_manipulation,
2670                table_column_names=table_column_names,
2671                annotation_keys=annotation_keys,
2672                file_annotations_upload=file_annotations_upload,
2673            )
2674        elif manifest_record_type == "file_and_entities":
2675            manifest_synapse_file_id = self.upload_manifest_as_csv(
2676                dmge=dmge,
2677                manifest=manifest,
2678                metadataManifestPath=metadataManifestPath,
2679                datasetId=datasetId,
2680                restrict=restrict_manifest,
2681                hideBlanks=hideBlanks,
2682                manifest_record_type=manifest_record_type,
2683                component_name=component_name,
2684                annotation_keys=annotation_keys,
2685                file_annotations_upload=file_annotations_upload,
2686            )
2687        elif manifest_record_type == "table_file_and_entities":
2688            manifest_synapse_file_id = self.upload_manifest_combo(
2689                dmge=dmge,
2690                manifest=manifest,
2691                metadataManifestPath=metadataManifestPath,
2692                datasetId=datasetId,
2693                table_name=table_name,
2694                component_name=component_name,
2695                restrict=restrict_manifest,
2696                hideBlanks=hideBlanks,
2697                manifest_record_type=manifest_record_type,
2698                table_manipulation=table_manipulation,
2699                table_column_names=table_column_names,
2700                annotation_keys=annotation_keys,
2701                file_annotations_upload=file_annotations_upload,
2702            )
2703        else:
2704            raise ValueError("Please enter a valid manifest_record_type.")
2705        return manifest_synapse_file_id

Associate metadata with files in a storage dataset already on Synapse. Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.

If this is a new manifest there could be no Synapse entities associated with the rows of this manifest this may be due to data type (e.g. clinical data) being tabular and not requiring files; to utilize uniform interfaces downstream (i.e. fileviews), a Synapse entity (a folder) is created for each row and an entity column is added to the manifest containing the resulting entity IDs; a table is also created at present as an additional interface for downstream query and interaction with the data.

Arguments:
  • dmge: DataModelGraphExplorer Object
  • metadataManifestPath: path to csv containing a validated metadata manifest. The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. Some datasets, e.g. clinical data, do not contain file IDs, but data is stored in a table: one row per item. In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the row's column data as metadata/annotations to this file.
  • datasetId: synapse ID of folder containing the dataset
  • manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and table options in combination.
  • hideBlanks: Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
  • restrict_manifest (bool): Default is False. Flag for censored data.
  • table_manipulation (str): Default is 'replace'. Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:

manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
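
A hedged end-to-end sketch of this entry point follows; it assumes an authenticated SynapseStorage and a DataModelGraphExplorer built from the project's data model, and all IDs and paths are placeholders. The manifest_record_type argument selects which of the three upload helpers above is dispatched.

from schematic.schemas.data_model_graph import DataModelGraphExplorer
from schematic.store.synapse import SynapseStorage

def submit_manifest(store: SynapseStorage, dmge: DataModelGraphExplorer) -> str:
    # manifest_record_type selects the upload helper:
    #   "file_only" / "file_and_entities" -> upload_manifest_as_csv
    #   "table_and_file"                  -> upload_manifest_as_table
    #   "table_file_and_entities"         -> upload_manifest_combo
    return store.associateMetadataWithFiles(
        dmge=dmge,
        metadataManifestPath="synapse_storage_manifest_patient.csv",
        datasetId="syn12345678",
        manifest_record_type="table_and_file",
        hideBlanks=True,
        restrict_manifest=False,
        table_manipulation="replace",
        table_column_names="class_label",
        annotation_keys="class_label",
        file_annotations_upload=True,
    )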

def getTableAnnotations(self, table_id: str):
2707    def getTableAnnotations(self, table_id: str):
2708        """Generate dictionary of annotations for the given Synapse file.
2709        Synapse returns all custom annotations as lists since they
2710        can contain multiple values. In all cases, the values will
2711        be converted into strings and concatenated with ", ".
2712
2713        Args:
2714            table_id (str): Synapse ID for the table.
2715
2716        Returns:
2717            dict: Annotations as comma-separated strings.
2718        """
2719        try:
2720            entity = self.synapse_entity_tracker.get(
2721                synapse_id=table_id, syn=self.syn, download_file=False
2722            )
2723            is_table = entity.concreteType.endswith(".TableEntity")
2724            annotations_raw = entity.annotations
2725        except SynapseHTTPError:
2726            # If an error occurs with retrieving entity, skip it
2727            # This could be caused by a temporary file view that
2728            # was deleted since its ID was retrieved
2729            is_file, is_table = False, False
2730
2731        # Skip anything that isn't a table
2732        if not is_table:
2733            return None
2734
2735        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2736
2737        return annotations

Generate dictionary of annotations for the given Synapse table. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".

Arguments:
  • table_id (str): Synapse ID for the table.
Returns:

dict: Annotations as comma-separated strings.

def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2739    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2740        """Generate dictionary of annotations for the given Synapse file.
2741        Synapse returns all custom annotations as lists since they
2742        can contain multiple values. In all cases, the values will
2743        be converted into strings and concatenated with ", ".
2744
2745        Args:
2746            fileId (str): Synapse ID for dataset file.
2747
2748        Returns:
2749            dict: Annotations as comma-separated strings.
2750        """
2751
2752        # Get entity metadata, including annotations
2753        try:
2754            entity = self.synapse_entity_tracker.get(
2755                synapse_id=fileId, syn=self.syn, download_file=False
2756            )
2757            is_file = entity.concreteType.endswith(".FileEntity")
2758            is_folder = entity.concreteType.endswith(".Folder")
2759            annotations_raw = entity.annotations
2760        except SynapseHTTPError:
2761            # If an error occurs with retrieving entity, skip it
2762            # This could be caused by a temporary file view that
2763            # was deleted since its ID was retrieved
2764            is_file, is_folder = False, False
2765
2766        # Skip anything that isn't a file or folder
2767        if not (is_file or is_folder):
2768            return None
2769
2770        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2771
2772        return annotations

Generate dictionary of annotations for the given Synapse file. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".

Arguments:
  • fileId (str): Synapse ID for dataset file.
Returns:

dict: Annotations as comma-separated strings.

def getEntityAnnotations(self, fileId, entity, annotations_raw):
2774    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2775        # Extract annotations from their lists and stringify. For example:
2776        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2777        annotations = dict()
2778        for key, vals in annotations_raw.items():
2779            if isinstance(vals, list) and len(vals) == 1:
2780                annotations[key] = str(vals[0])
2781            else:
2782                annotations[key] = ", ".join(str(v) for v in vals)
2783
2784        # Add the file entity ID and eTag, which weren't lists
2785        assert fileId == entity.id, (
2786            "For some reason, the Synapse ID in the response doesn't match"
2787            "the Synapse ID sent in the request (via synapseclient)."
2788        )
2789        annotations["entityId"] = fileId
2790        annotations["eTag"] = entity.etag
2791
2792        return annotations
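
The list-flattening behavior of getEntityAnnotations can be seen in isolation with plain dictionaries: single-element lists collapse to a bare string, and multi-element lists join with ", ".

annotations_raw = {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}
annotations = {}
for key, vals in annotations_raw.items():
    if isinstance(vals, list) and len(vals) == 1:
        annotations[key] = str(vals[0])
    else:
        annotations[key] = ", ".join(str(v) for v in vals)
print(annotations)  # {'YearofBirth': '1980', 'author': 'bruno, milen, sujay'}
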
def getDatasetAnnotations( self, datasetId: str, fill_na: bool = True, force_batch: bool = False) -> pandas.core.frame.DataFrame:
2794    def getDatasetAnnotations(
2795        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2796    ) -> pd.DataFrame:
2797        """Generate table for annotations across all files in given dataset.
2798
2799        Args:
2800            datasetId (str): Synapse ID for dataset folder.
2801            fill_na (bool): Whether to replace missing values with
2802                blank strings.
2803            force_batch (bool): Whether to force the function to use
2804                the batch mode, which uses a file view to retrieve
2805                annotations for a given dataset. Defaults to False
2806                unless there are 50 or more files in the dataset.
2807
2808        Returns:
2809            pd.DataFrame: Table of annotations.
2810        """
2811        # Get all files in given dataset
2812        dataset_files = self.getFilesInStorageDataset(datasetId)
2813
2814        # if there are no dataset files, there are no annotations;
2815        # return an empty DataFrame
2816        if not dataset_files:
2817            return pd.DataFrame()
2818
2819        dataset_files_map = dict(dataset_files)
2820        dataset_file_ids, _ = list(zip(*dataset_files))
2821
2822        # Get annotations for each file from Step 1
2823        # Batch mode
2824        try_batch = len(dataset_files) >= 50 or force_batch
2825        if try_batch:
2826            try:
2827                logger.info("Trying batch mode for retrieving Synapse annotations")
2828                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2829            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2830                logger.info(
2831                    f"Unable to create a temporary file view bound to {datasetId}. "
2832                    "Defaulting to slower iterative retrieval of annotations."
2833                )
2834                # Default to the slower non-batch method
2835                logger.info("Batch mode failed (probably due to permission error)")
2836                try_batch = False
2837
2838        # Non-batch mode
2839        if not try_batch:
2840            logger.info("Using slower (non-batch) sequential mode")
2841            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2842            # Remove any annotations for non-file/folders (stored as None)
2843            records = filter(None, records)
2844            table = pd.DataFrame.from_records(records)
2845
2846        # Add filenames for the files that "survived" annotation retrieval
2847        filenames = [dataset_files_map[i] for i in table["entityId"]]
2848
2849        if "Filename" not in table.columns:
2850            table.insert(0, "Filename", filenames)
2851
2852        # Ensure that entityId and eTag are at the end
2853        entity_ids = table.pop("entityId")
2854        etags = table.pop("eTag")
2855        table.insert(len(table.columns), "entityId", entity_ids)
2856        table.insert(len(table.columns), "eTag", etags)
2857
2858        # Missing values are filled in with empty strings for Google Sheets
2859        if fill_na:
2860            table.fillna("", inplace=True)
2861
2862        # Force all values as strings
2863        return table.astype(str)

Generate table for annotations across all files in given dataset.

Arguments:
  • datasetId (str): Synapse ID for dataset folder.
  • fill_na (bool): Whether to replace missing values with blank strings.
  • force_batch (bool): Whether to force the function to use the batch mode, which uses a file view to retrieve annotations for a given dataset. Defaults to False unless there are 50 or more files in the dataset.
Returns:

pd.DataFrame: Table of annotations.
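
The pop/insert idiom used to push entityId and eTag to the end of the table is a small pandas trick worth isolating; a self-contained demonstration on a toy frame:

import pandas as pd

table = pd.DataFrame(
    {"entityId": ["syn1"], "Filename": ["a.txt"], "eTag": ["abc"], "Sex": ["F"]}
)
# pop removes the column and returns it as a Series; inserting at
# len(table.columns) appends it, so the two columns end up last.
entity_ids = table.pop("entityId")
etags = table.pop("eTag")
table.insert(len(table.columns), "entityId", entity_ids)
table.insert(len(table.columns), "eTag", etags)
print(list(table.columns))  # ['Filename', 'Sex', 'entityId', 'eTag']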

def raise_final_error(retry_state):
2865    def raise_final_error(retry_state):
2866        return retry_state.outcome.result()
def checkIfinAssetView(self, syn_id) -> bool:
2868    def checkIfinAssetView(self, syn_id) -> bool:
2869        # get data in administrative fileview for this pipeline
2870        assetViewTable = self.getStorageFileviewTable()
2871        all_files = list(assetViewTable["id"])
2872        if syn_id in all_files:
2873            return True
2874        else:
2875            return False
@tracer.start_as_current_span('SynapseStorage::getDatasetProject')
@retry(stop=stop_after_attempt(5), wait=wait_chain(*[wait_fixed(10) for i in range(2)] + [wait_fixed(15) for i in range(2)] + [wait_fixed(20)]), retry=retry_if_exception_type(LookupError), retry_error_callback=raise_final_error)
def getDatasetProject(self, datasetId: str) -> str:
2877    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2878    @retry(
2879        stop=stop_after_attempt(5),
2880        wait=wait_chain(
2881            *[wait_fixed(10) for i in range(2)]
2882            + [wait_fixed(15) for i in range(2)]
2883            + [wait_fixed(20)]
2884        ),
2885        retry=retry_if_exception_type(LookupError),
2886        retry_error_callback=raise_final_error,
2887    )
2888    def getDatasetProject(self, datasetId: str) -> str:
2889        """Get parent project for a given dataset ID.
2890
2891        Args:
2892            datasetId (str): Synapse entity ID (folder or project).
2893
2894        Raises:
2895            PermissionError: Raised if the Synapse ID cannot be retrieved by the user.
2896            LookupError: Raised if the Synapse ID doesn't appear in the file view.
2897
2898        Returns:
2899            str: The Synapse ID for the parent project.
2900        """
2901
2902        # Subset main file view
2903        dataset_index = self.storageFileviewTable["id"] == datasetId
2904        dataset_row = self.storageFileviewTable[dataset_index]
2905
2906        # re-query if no datasets found
2907        if dataset_row.empty:
2908            sleep(5)
2909            self.query_fileview(force_requery=True)
2910            # Subset main file view
2911            dataset_index = self.storageFileviewTable["id"] == datasetId
2912            dataset_row = self.storageFileviewTable[dataset_index]
2913
2914        # Return `projectId` for given row if only one found
2915        if len(dataset_row) == 1:
2916            dataset_project = dataset_row["projectId"].values[0]
2917            return dataset_project
2918
2919        # Otherwise, check if already project itself
2920        try:
2921            syn_object = self.synapse_entity_tracker.get(
2922                synapse_id=datasetId, syn=self.syn, download_file=False
2923            )
2924            if syn_object.properties["concreteType"].endswith("Project"):
2925                return datasetId
2926        except SynapseHTTPError:
2927            raise PermissionError(
2928                f"The given dataset ({datasetId}) isn't accessible with this "
2929                "user. This might be caused by a typo in the dataset Synapse ID."
2930            )
2931
2932        # If not, then assume dataset not in file view
2933        raise LookupError(
2934            f"The given dataset ({datasetId}) doesn't appear in the "
2935            f"configured file view ({self.storageFileview}). This might "
2936            "mean that the file view's scope needs to be updated."
2937        )

Get parent project for a given dataset ID.

Arguments:
  • datasetId (str): Synapse entity ID (folder or project).
Raises:
  • PermissionError: Raised if the Synapse ID cannot be retrieved by the user.
  • LookupError: Raised if the Synapse ID doesn't appear in the file view.
Returns:

str: The Synapse ID for the parent project.
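
The tenacity decorator on getDatasetProject encodes an escalating retry schedule: five attempts, waiting 10, 10, 15, 15, and then 20 seconds between LookupError retries. A runnable toy with the same shape follows (sleeps shortened so it finishes quickly; flaky_lookup is a hypothetical stand-in):

from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_chain,
    wait_fixed,
)

attempts = {"n": 0}

@retry(
    stop=stop_after_attempt(5),
    wait=wait_chain(
        *[wait_fixed(0.01) for _ in range(2)]
        + [wait_fixed(0.02) for _ in range(2)]
        + [wait_fixed(0.03)]
    ),
    retry=retry_if_exception_type(LookupError),
)
def flaky_lookup() -> str:
    # Fails twice, then succeeds, exercising the first two waits.
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise LookupError("dataset not in file view yet")
    return "syn_project_id"

print(flaky_lookup(), "after", attempts["n"], "attempts")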

def getDatasetAnnotationsBatch( self, datasetId: str, dataset_file_ids: Sequence[str] = None) -> pandas.core.frame.DataFrame:
2939    def getDatasetAnnotationsBatch(
2940        self, datasetId: str, dataset_file_ids: Sequence[str] = None
2941    ) -> pd.DataFrame:
2942        """Generate table for annotations across all files in given dataset.
2943        This function uses a temporary file view to generate a table
2944        instead of iteratively querying for individual entity annotations.
2945        This function is expected to run much faster than
2946        `self.getDatasetAnnotations` on large datasets.
2947
2948        Args:
2949            datasetId (str): Synapse ID for dataset folder.
2950            dataset_file_ids (Sequence[str]): List of Synapse IDs
2951                for dataset files/folders used to subset the table.
2952
2953        Returns:
2954            pd.DataFrame: Table of annotations.
2955        """
2956        # Create data frame from annotations file view
2957        with DatasetFileView(datasetId, self.syn) as fileview:
2958            table = fileview.query()
2959
2960        if dataset_file_ids:
2961            table = table.loc[table.index.intersection(dataset_file_ids)]
2962
2963        table = table.reset_index(drop=True)
2964
2965        return table

Generate table for annotations across all files in given dataset. This function uses a temporary file view to generate a table instead of iteratively querying for individual entity annotations. This function is expected to run much faster than self.getDatasetAnnotations on large datasets.

Arguments:
  • datasetId (str): Synapse ID for dataset folder.
  • dataset_file_ids (Sequence[str]): List of Synapse IDs for dataset files/folders used to subset the table.
Returns:

pd.DataFrame: Table of annotations.
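
The subsetting step on the file-view result relies on pandas Index.intersection, which keeps only rows whose index appears in the requested ID list and silently ignores absent IDs; a self-contained toy:

import pandas as pd

table = pd.DataFrame({"Sex": ["F", "M", "F"]}, index=["syn1", "syn2", "syn3"])
dataset_file_ids = ["syn1", "syn3", "syn999"]  # syn999 is not in the view
table = table.loc[table.index.intersection(dataset_file_ids)]
print(table.reset_index(drop=True))  # rows for syn1 and syn3 only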

class TableOperations:
2978class TableOperations:
2979    """
2980    Object to hold functions for various table operations specific to the Synapse Asset Store.
2981
2982    Currently implemented operations are:
2983    createTable: upload a manifest as a new table when none exists
2984    replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
2985    updateTable: add a column to a table that already exists on synapse
2986
2987    Operations currently in development are:
2988    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
2989    """
2990
2991    def __init__(
2992        self,
2993        synStore: SynapseStorage,
2994        tableToLoad: pd.DataFrame = None,
2995        tableName: str = None,
2996        datasetId: str = None,
2997        existingTableId: str = None,
2998        restrict: bool = False,
2999        synapse_entity_tracker: SynapseEntityTracker = None,
3000    ):
3001        """
3002        Class governing table operations (creation, replacement, upserts, updates) in schematic
3003
3004        tableToLoad: manifest formatted appropriately for the table
3005        tableName: name of the table to be uploaded
3006        datasetId: synID of the dataset for the manifest
3007        existingTableId: synId of the table currently existing on synapse (if there is one)
3008        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3009        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3010
3011        """
3012        self.synStore = synStore
3013        self.tableToLoad = tableToLoad
3014        self.tableName = tableName
3015        self.datasetId = datasetId
3016        self.existingTableId = existingTableId
3017        self.restrict = restrict
3018        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3019
3020    @tracer.start_as_current_span("TableOperations::createTable")
3021    def createTable(
3022        self,
3023        columnTypeDict: dict = None,
3024        specifySchema: bool = True,
3025    ):
3026        """
3027        Method to create a table from a metadata manifest and upload it to synapse
3028
3029        Args:
3030            columnTypeDict: dictionary schema for table columns: type, size, etc
3031            specifySchema: to specify a specific schema for the table format
3032
3033        Returns:
3034            table.schema.id: synID of the newly created table
3035        """
3036        datasetEntity = self.synapse_entity_tracker.get(
3037            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3038        )
3039        datasetName = datasetEntity.name
3040        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3041
3042        if not self.tableName:
3043            self.tableName = datasetName + "table"
3044        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3045        if specifySchema:
3046            if columnTypeDict == {}:
3047                logger.error("Did not provide a columnTypeDict.")
3048            # create list of columns:
3049            cols = []
3050            for col in self.tableToLoad.columns:
3051                if col in table_schema_by_cname:
3052                    col_type = table_schema_by_cname[col]["columnType"]
3053                    max_size = (
3054                        table_schema_by_cname[col]["maximumSize"]
3055                        if "maximumSize" in table_schema_by_cname[col].keys()
3056                        else 100
3057                    )
3058                    max_list_len = 250
3059                    if max_size and max_list_len:
3060                        cols.append(
3061                            Column(
3062                                name=col,
3063                                columnType=col_type,
3064                                maximumSize=max_size,
3065                                maximumListLength=max_list_len,
3066                            )
3067                        )
3068                    elif max_size:
3069                        cols.append(
3070                            Column(name=col, columnType=col_type, maximumSize=max_size)
3071                        )
3072                    else:
3073                        cols.append(Column(name=col, columnType=col_type))
3074                else:
3075                    # TODO: add warning that the given col was not found and its max size is set to 100
3076                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3077            schema = Schema(
3078                name=self.tableName, columns=cols, parent=datasetParentProject
3079            )
3080            table = Table(schema, self.tableToLoad)
3081            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3082            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3083            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3084            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3085            return table.schema.id
3086        else:
3087            # For just uploading the tables to synapse using default
3088            # column types.
3089            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3090            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3091            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3092            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3093            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3094            return table.schema.id
3095
3096    @tracer.start_as_current_span("TableOperations::replaceTable")
3097    def replaceTable(
3098        self,
3099        specifySchema: bool = True,
3100        columnTypeDict: dict = None,
3101    ):
3102        """
3103        Method to replace an existing table on synapse with metadata from a new manifest
3104
3105        Args:
3106            specifySchema: to infer a schema for the table format
3107            columnTypeDict: dictionary schema for table columns: type, size, etc
3108
3109        Returns:
3110           existingTableId: synID of the already existing table that had its metadata replaced
3111        """
3112        datasetEntity = self.synapse_entity_tracker.get(
3113            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3114        )
3115
3116        datasetName = datasetEntity.name
3117        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3118        existing_table, existing_results = self.synStore.get_synapse_table(
3119            self.existingTableId
3120        )
3121        # remove rows
3122        self.synStore.syn.delete(existing_results)
3123        # Data changes such as removing all rows causes the eTag to change.
3124        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3125        # wait for row deletion to finish on synapse before getting empty table
3126        sleep(10)
3127
3128        # removes all current columns
3129        current_table = self.synapse_entity_tracker.get(
3130            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3131        )
3132
3133        current_columns = self.synStore.syn.getTableColumns(current_table)
3134        for col in current_columns:
3135            current_table.removeColumn(col)
3136
3137        if not self.tableName:
3138            self.tableName = datasetName + "table"
3139
3140        # Process columns according to manifest entries
3141        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3142        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3143        if specifySchema:
3144            if columnTypeDict == {}:
3145                logger.error("Did not provide a columnTypeDict.")
3146            # create list of columns:
3147            cols = []
3148
3149            for col in self.tableToLoad.columns:
3150                if col in table_schema_by_cname:
3151                    col_type = table_schema_by_cname[col]["columnType"]
3152                    max_size = (
3153                        table_schema_by_cname[col]["maximumSize"]
3154                        if "maximumSize" in table_schema_by_cname[col].keys()
3155                        else 100
3156                    )
3157                    max_list_len = 250
3158                    if max_size and max_list_len:
3159                        cols.append(
3160                            Column(
3161                                name=col,
3162                                columnType=col_type,
3163                                maximumSize=max_size,
3164                                maximumListLength=max_list_len,
3165                            )
3166                        )
3167                    elif max_size:
3168                        cols.append(
3169                            Column(name=col, columnType=col_type, maximumSize=max_size)
3170                        )
3171                    else:
3172                        cols.append(Column(name=col, columnType=col_type))
3173                else:
3174                    # TODO add warning that the given col was not found and its max size is set to 100
3175                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3176
3177            # adds new columns to schema
3178            for col in cols:
3179                current_table.addColumn(col)
3180            table_result = self.synStore.syn.store(
3181                current_table, isRestricted=self.restrict
3182            )
3183            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3184            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3185            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3186
3187            # wait for synapse store to finish
3188            sleep(1)
3189
3190            # build schema and table from columns and store with necessary restrictions
3191            schema = Schema(
3192                name=self.tableName, columns=cols, parent=datasetParentProject
3193            )
3194            schema.id = self.existingTableId
3195            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3196            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3197            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3198            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3199            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3200        else:
3201            logger.error("Must specify a schema for table replacements")
3202
3203        # remove system metadata from manifest
3204        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3205        return self.existingTableId
3206
3207    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3208    def _get_auth_token(
3209        self,
3210    ):
3211        authtoken = None
3212
3213        # Get access token from environment variable if available
3214        # Primarily useful for testing environments, with other possible usefulness for containers
3215        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3216        if env_access_token:
3217            authtoken = env_access_token
3218            return authtoken
3219
3220        # Get token from authorization header
3221        # Primarily useful for API endpoint functionality
3222        if "Authorization" in self.synStore.syn.default_headers:
3223            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3224                "Bearer "
3225            )[-1]
3226            return authtoken
3227
3228        # retrieve credentials from synapse object
3229        # Primarily useful for local users; credentials could only be stored here when a .synapseConfig file is used, but including to be safe
3230        synapse_object_creds = self.synStore.syn.credentials
3231        if hasattr(synapse_object_creds, "_token"):
3232            authtoken = synapse_object_creds.secret
3233
3234        # Try getting creds from .synapseConfig file if it exists
3235        # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
3236        if os.path.exists(CONFIG.synapse_configuration_path):
3237            config = get_config_file(CONFIG.synapse_configuration_path)
3238
3239            # check which credentials are provided in file
3240            if config.has_option("authentication", "authtoken"):
3241                authtoken = config.get("authentication", "authtoken")
3242
3243        # raise error if required credentials are not found
3244        if not authtoken:
3245            raise NameError(
3246                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3247            )
3248
3249        return authtoken
3250
3251    @tracer.start_as_current_span("TableOperations::upsertTable")
3252    def upsertTable(self, dmge: DataModelGraphExplorer):
3253        """
3254        Method to upsert rows from a new manifest into an existing table on synapse
3255        For upsert functionality to work, primary keys must follow the naming convention of <component>_id
3256        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3257        Currently it is required to use -dl/--use_display_label with table upserts.
3258
3259
3260        Args:
3261            dmge: DataModelGraphExplorer instance
3262
3263        Returns:
3264           existingTableId: synID of the already existing table that had its metadata replaced
3265        """
3266
3267        authtoken = self._get_auth_token()
3268
3269        synapseDB = SynapseDatabase(
3270            auth_token=authtoken,
3271            project_id=self.synStore.getDatasetProject(self.datasetId),
3272            syn=self.synStore.syn,
3273            synapse_entity_tracker=self.synapse_entity_tracker,
3274        )
3275
3276        try:
3277            # Try performing upsert
3278            synapseDB.upsert_table_rows(
3279                table_name=self.tableName, data=self.tableToLoad
3280            )
3281        except SynapseHTTPError as ex:
3282            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3283            if "Id is not a valid column name or id" in str(ex):
3284                self._update_table_uuid_column(dmge)
3285                synapseDB.upsert_table_rows(
3286                    table_name=self.tableName, data=self.tableToLoad
3287                )
3288            # Raise if other error
3289            else:
3290                raise ex
3291
3292        return self.existingTableId
3293
3294    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3295    def _update_table_uuid_column(
3296        self,
3297        dmge: DataModelGraphExplorer,
3298    ) -> None:
3299        """Removes the `Uuid` column when present, and replaces it with an `Id` column
3300        Used to enable backwards compatibility for manifests using the old `Uuid` convention
3301
3302        Args:
3303            dmge: DataModelGraphExplorer instance
3304
3305        Returns:
3306            None
3307        """
3308
3309        # Get the columns of the schema
3310        schema = self.synapse_entity_tracker.get(
3311            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3312        )
3313
3314        cols = self.synStore.syn.getTableColumns(schema)
3315
3316        # Iterate through columns until `Uuid` column is found
3317        for col in cols:
3318            if col.name.lower() == "uuid":
3319                # See if schema has `Uuid` column specified
3320                try:
3321                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3322                except KeyError:
3323                    uuid_col_in_schema = False
3324
3325                # If there is, then create a new `Id` column from scratch
3326                if uuid_col_in_schema:
3327                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3328                    schema.addColumn(new_col)
3329                    schema = self.synStore.syn.store(schema)
3330                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3331                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3332                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3333                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3334                else:
3335                    # Build ColumnModel that will be used for new column
3336                    id_column = Column(
3337                        name="Id",
3338                        columnType="STRING",
3339                        maximumSize=64,
3340                        defaultValue=None,
3341                        maximumListLength=1,
3342                    )
3343                    new_col_response = self.synStore.syn.store(id_column)
3344
3345                    # Define columnChange body
3346                    columnChangeDict = {
3347                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3348                        "entityId": self.existingTableId,
3349                        "changes": [
3350                            {
3351                                "oldColumnId": col["id"],
3352                                "newColumnId": new_col_response["id"],
3353                            }
3354                        ],
3355                    }
3356
3357                    self.synStore.syn._async_table_update(
3358                        table=self.existingTableId,
3359                        changes=[columnChangeDict],
3360                        wait=False,
3361                    )
3362                break
3363
3364        return
3365
3366    @tracer.start_as_current_span("TableOperations::updateTable")
3367    def updateTable(
3368        self,
3369        update_col: str = "Id",
3370    ):
3371        """
3372        Method to update an existing table with a new column
3373
3374        Args:
3375            update_col: column to index the old and new tables on
3376
3377        Returns:
3378           existingTableId: synID of the already existing table that had its metadata replaced
3379        """
3380        existing_table, existing_results = self.synStore.get_synapse_table(
3381            self.existingTableId
3382        )
3383
3384        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3385        # store table with existing etag data and impose restrictions as appropriate
3386        table_result = self.synStore.syn.store(
3387            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3388            isRestricted=self.restrict,
3389        )
3390        # We cannot store the Table to the `synapse_entity_tracker` because there is
3391        # no `Schema` on the table object. The above `.store()` function call would
3392        # also update the ETag of the entity within Synapse. Remove it from the tracker
3393        # and re-retrieve it later on if needed again.
3394        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3395
3396        return self.existingTableId

Object to hold functions for various table operations specific to the Synapse Asset Store.

Currently implemented operations are:
  • createTable: upload a manifest as a new table when none exists
  • replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
  • updateTable: add a column to a table that already exists on synapse

Operations currently in development are:
  • upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest

TableOperations(synStore: SynapseStorage, tableToLoad: pandas.core.frame.DataFrame = None, tableName: str = None, datasetId: str = None, existingTableId: str = None, restrict: bool = False, synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = None)

Class governing table operations (creation, replacement, upserts, updates) in schematic

Arguments:
  • tableToLoad: manifest formatted appropriately for the table
  • tableName: name of the table to be uploaded
  • datasetId: synID of the dataset for the manifest
  • existingTableId: synID of the table currently existing on synapse (if there is one)
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
  • synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

synStore
tableToLoad
tableName
datasetId
existingTableId
restrict
synapse_entity_tracker
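
A minimal construction sketch follows; the Synapse IDs, table name, and manifest path are placeholders, and it assumes a SynapseStorage instance can be created from already-configured credentials rather than showing a definitive login flow.

    import pandas as pd

    from schematic.store.synapse import SynapseStorage, TableOperations

    # Assumes Synapse credentials are already configured
    # (e.g. a .synapseConfig file or the SYNAPSE_ACCESS_TOKEN environment variable)
    syn_store = SynapseStorage()

    # Placeholder manifest; real manifests come out of schematic's manifest generation
    manifest_df = pd.read_csv("synapse_storage_manifest.csv")

    table_ops = TableOperations(
        synStore=syn_store,
        tableToLoad=manifest_df,
        tableName="ExamplePatientTable",  # hypothetical table name
        datasetId="syn12345678",          # placeholder dataset synID
        existingTableId=None,             # None when no table exists yet
        restrict=False,                   # True would store with access restrictions
    )
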
@tracer.start_as_current_span('TableOperations::createTable')
def createTable(self, columnTypeDict: dict = None, specifySchema: bool = True):

Method to create a table from a metadata manifest and upload it to synapse

Arguments:
  • columnTypeDict: dictionary schema for table columns: type, size, etc
  • specifySchema: to specify a specific schema for the table format
Returns:

table.schema.id: synID of the newly created table
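
For illustration, a hedged sketch using the `table_ops` object from the construction example above; the per-column dictionary shape is an assumption inferred from the "columnType"/"maximumSize" keys the method reads, not a documented contract for columnTypeDict.

    # Assumed columnTypeDict shape: per-column type/size hints (not a documented contract)
    column_types = {
        "PatientID": {"columnType": "STRING", "maximumSize": 64},
        "Age": {"columnType": "INTEGER"},
    }

    # Explicit schema: a Column object is built per manifest column; columns missing
    # from column_types fall back to STRING with maximumSize=100
    new_table_id = table_ops.createTable(columnTypeDict=column_types, specifySchema=True)

    # Alternatively, let synapseclient's build_table infer default column types:
    # new_table_id = table_ops.createTable(specifySchema=False)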

@tracer.start_as_current_span('TableOperations::replaceTable')
def replaceTable(self, specifySchema: bool = True, columnTypeDict: dict = None):

Method to replace an existing table on synapse with metadata from a new manifest

Arguments:
  • specifySchema: to specify a schema for the table format
  • columnTypeDict: dictionary schema for table columns: type, size, etc
Returns:

existingTableId: synID of the already existing table that had its metadata replaced
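
A sketch of the replacement flow, reusing the hypothetical `syn_store` and `column_types` objects from the earlier examples; `new_manifest_df` and both synIDs are placeholders.

    table_ops = TableOperations(
        synStore=syn_store,
        tableToLoad=new_manifest_df,    # updated manifest contents (placeholder)
        tableName="ExamplePatientTable",
        datasetId="syn12345678",
        existingTableId="syn87654321",  # placeholder synID of the table to replace
        restrict=False,
    )

    # Deletes all existing rows, rebuilds the column schema, then stores the new
    # manifest; the existing table's synID is returned unchanged
    replaced_id = table_ops.replaceTable(specifySchema=True, columnTypeDict=column_types)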

@tracer.start_as_current_span('TableOperations::upsertTable')
def upsertTable(self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer):

Method to upsert rows from a new manifest into an existing table on synapse. For upsert functionality to work, primary keys must follow the naming convention of <component>_id. `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. Currently it is required to use -dl/--use_display_label with table upserts.

Arguments:
  • dmge: DataModelGraphExplorer instance
Returns:

existingTableId: synID of the already existing table that had its metadata replaced
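
A hedged sketch: the manifest is assumed to carry a <component>_id primary-key column (e.g. "Patient_id" for a Patient component), and the construction of the DataModelGraphExplorer from a pre-built data model graph is abbreviated rather than shown as a definitive recipe.

    from schematic.schemas.data_model_graph import DataModelGraphExplorer

    # `graph` is assumed to be a data model graph built elsewhere
    # (e.g. via DataModelGraph.generate_data_model_graph())
    dmge = DataModelGraphExplorer(graph)

    # Rows are matched on the <component>_id primary key: new keys are inserted,
    # existing keys have their rows updated
    upserted_id = table_ops.upsertTable(dmge=dmge)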

@tracer.start_as_current_span('TableOperations::updateTable')
def updateTable(self, update_col: str = 'Id'):

Method to update an existing table with a new column

Arguments:
  • update_col: column to index the old and new tables on
Returns:

existingTableId: synID of the already existing table that had its metadata replaced
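
A one-line sketch, continuing the hypothetical `table_ops` example: update_df merges the existing table with tableToLoad on the update_col key, so both must contain that column.

    # Both the existing table and tableToLoad must carry the "Id" column
    updated_id = table_ops.updateTable(update_col="Id")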

class DatasetFileView:
3399class DatasetFileView:
3400    """Helper class to create temporary dataset file views.
3401    This class can be used in conjunction with a 'with' statement.
3402    This will ensure that the file view is deleted automatically.
3403    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3404    """
3405
3406    def __init__(
3407        self,
3408        datasetId: str,
3409        synapse: Synapse,
3410        name: str = None,
3411        temporary: bool = True,
3412        parentId: str = None,
3413    ) -> None:
3414        """Create a file view scoped to a dataset folder.
3415
3416        Args:
3417            datasetId (str): Synapse ID for a dataset folder/project.
3418            synapse (Synapse): Used for Synapse requests.
3419            name (str): Name of the file view (temporary or not).
3420            temporary (bool): Whether to delete the file view on exit
3421                of either a 'with' statement or Python entirely.
3422            parentId (str, optional): Synapse ID specifying where to
3423                store the file view. Defaults to datasetId.
3424        """
3425
3426        self.datasetId = datasetId
3427        self.synapse = synapse
3428        self.is_temporary = temporary
3429
3430        # Use the provided name, or fall back to a default (ensures self.name is always set)
3431        self.name = name or f"schematic annotation file view for {self.datasetId}"
3432
3433        if self.is_temporary:
3434            uid = secrets.token_urlsafe(5)
3435            self.name = f"{self.name} - UID {uid}"
3436
3437        # TODO: Allow a DCC admin to configure a "universal parent"
3438        #       Such as a Synapse project writeable by everyone.
3439        self.parentId = datasetId if parentId is None else parentId
3440
3441        # TODO: Create local sharing setting to hide from everyone else
3442        view_schema = EntityViewSchema(
3443            name=self.name,
3444            parent=self.parentId,
3445            scopes=self.datasetId,
3446            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3447            addDefaultViewColumns=False,
3448            addAnnotationColumns=True,
3449        )
3450
3451        # TODO: Handle failure due to insufficient permissions by
3452        #       creating a temporary new project to store view
3453        self.view_schema = self.synapse.store(view_schema)
3454
3455        # These are filled in after calling `self.query()`
3456        self.results = None
3457        self.table = None
3458
3459        # Ensure deletion of the file view (last resort)
3460        if self.is_temporary:
3461            atexit.register(self.delete)
3462
3463    def __enter__(self):
3464        """Return file view when entering 'with' statement."""
3465        return self
3466
3467    def __exit__(self, exc_type, exc_value, traceback):
3468        """Delete file view when exiting 'with' statement."""
3469        if self.is_temporary:
3470            self.delete()
3471
3472    def delete(self):
3473        """Delete the file view on Synapse without deleting local table."""
3474        if self.view_schema is not None:
3475            self.synapse.delete(self.view_schema)
3476            self.view_schema = None
3477
3478    def query(self, tidy=True, force=False):
3479        """Retrieve file view as a data frame (raw format sans index)."""
3480        if self.table is None or force:
3481            fileview_id = self.view_schema["id"]
3482            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3483            self.table = self.results.asDataFrame(
3484                rowIdAndVersionInIndex=False,
3485                na_values=STR_NA_VALUES_FILTERED,
3486                keep_default_na=False,
3487            )
3488        if tidy:
3489            self.tidy_table()
3490        return self.table
3491
3492    def tidy_table(self):
3493        """Convert raw file view data frame into more usable format."""
3494        assert self.table is not None, "Must call `self.query()` first."
3495        self._fix_default_columns()
3496        self._fix_list_columns()
3497        self._fix_int_columns()
3498        return self.table
3499
3500    def _fix_default_columns(self):
3501        """Rename default columns to match schematic expectations."""
3502
3503        # Drop ROW_VERSION column if present
3504        if "ROW_VERSION" in self.table:
3505            del self.table["ROW_VERSION"]
3506
3507        # Rename id column to entityId and set as data frame index
3508        if "ROW_ID" in self.table:
3509            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3510            self.table = self.table.set_index("entityId", drop=False)
3511            del self.table["ROW_ID"]
3512
3513        # Rename ROW_ETAG column to eTag and place at end of data frame
3514        if "ROW_ETAG" in self.table:
3515            row_etags = self.table.pop("ROW_ETAG")
3516
3517            # eTag column may already be present if users annotated data without submitting a manifest
3518            # we're only concerned with the new values and not the existing ones
3519            if "eTag" in self.table:
3520                del self.table["eTag"]
3521
3522            self.table.insert(len(self.table.columns), "eTag", row_etags)
3523
3524        return self.table
3525
3526    def _get_columns_of_type(self, types):
3527        """Helper function to get list of columns of a given type(s)."""
3528        matching_columns = []
3529        for header in self.results.headers:
3530            if header.columnType in types:
3531                matching_columns.append(header.name)
3532        return matching_columns
3533
3534    def _fix_list_columns(self):
3535        """Fix formatting of list-columns."""
3536        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3537        list_columns = self._get_columns_of_type(list_types)
3538        for col in list_columns:
3539            self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
3540        return self.table
3541
3542    def _fix_int_columns(self):
3543        """Ensure that integer-columns are actually integers."""
3544        int_columns = self._get_columns_of_type({"INTEGER"})
3545        for col in int_columns:
3546            # Coercing to string because NaN is a floating point value
3547            # and cannot exist alongside integers in a column
3548            def to_int_fn(x):
3549                return "" if np.isnan(x) else str(int(x))
3550
3551            self.table[col] = self.table[col].apply(to_int_fn)
3552        return self.table

Helper class to create temporary dataset file views. This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. See SynapseStorage.getDatasetAnnotationsBatch for example usage.

DatasetFileView(datasetId: str, synapse: synapseclient.client.Synapse, name: str = None, temporary: bool = True, parentId: str = None)

Create a file view scoped to a dataset folder.

Arguments:
  • datasetId (str): Synapse ID for a dataset folder/project.
  • synapse (Synapse): Used for Synapse requests.
  • name (str): Name of the file view (temporary or not).
  • temporary (bool): Whether to delete the file view on exit of either a 'with' statement or Python entirely.
  • parentId (str, optional): Synapse ID specifying where to store the file view. Defaults to datasetId.
datasetId
synapse
is_temporary
parentId
view_schema
results
table
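
The intended context-manager usage looks roughly like the following sketch; the dataset synID and token are placeholders.

    from synapseclient import Synapse

    syn = Synapse()
    syn.login(authToken="<personal-access-token>")  # placeholder credential

    # temporary=True (the default) ensures the view is deleted on exiting the block
    with DatasetFileView("syn12345678", syn) as view:   # placeholder dataset synID
        annotations = view.query(tidy=True)             # tidied DataFrame, indexed by entityId
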
def delete(self):

Delete the file view on Synapse without deleting local table.

def query(self, tidy=True, force=False):

Retrieve file view as a data frame (raw format sans index).

def tidy_table(self):

Convert raw file view data frame into more usable format.
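
For instance, the helper can also be applied explicitly after a raw query (a sketch, assuming the `view` object from the earlier example):

    raw_df = view.query(tidy=False, force=True)  # raw file view: ROW_ID/ROW_VERSION/ROW_ETAG still present
    tidy_df = view.tidy_table()                  # renames default columns, joins list columns, casts integers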