schematic.store.synapse

Synapse storage class

   1"""Synapse storage class"""
   2
   3import asyncio
   4import atexit
   5import logging
   6import os
   7import re
   8import secrets
   9import shutil
  10import time
  11import uuid  # used to generate unique names for entities
  12from copy import deepcopy
  13from dataclasses import dataclass, field
  14from time import sleep
  15
  16# allows specifying explicit variable types
  17from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
  18
  19import numpy as np
  20import pandas as pd
  21import synapseclient
  22from opentelemetry import trace
  23from synapseclient import Annotations as OldAnnotations
  24from synapseclient import (
  25    Column,
  26    EntityViewSchema,
  27    EntityViewType,
  28    File,
  29    Folder,
  30    Schema,
  31    Synapse,
  32    Table,
  33    as_table_columns,
  34)
  35from synapseclient.annotations import _convert_to_annotations_list
  36from synapseclient.api import get_config_file, get_entity_id_bundle2
  37from synapseclient.core.constants.concrete_types import PROJECT_ENTITY
  38from synapseclient.core.exceptions import (
  39    SynapseAuthenticationError,
  40    SynapseHTTPError,
  41    SynapseUnmetAccessRestrictions,
  42)
  43from synapseclient.models.annotations import Annotations
  44from synapseclient.table import CsvFileTable, Schema, build_table
  45from tenacity import (
  46    retry,
  47    retry_if_exception_type,
  48    stop_after_attempt,
  49    wait_chain,
  50    wait_fixed,
  51)
  52
  53from schematic.configuration.configuration import CONFIG
  54from schematic.exceptions import AccessCredentialsError
   55from schematic.schemas.data_model_graph import DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
  56from schematic.store.base import BaseStorage
  57from schematic.store.database.synapse_database import SynapseDatabase
  58from schematic.store.synapse_tracker import SynapseEntityTracker
  59from schematic.utils.df_utils import (
  60    STR_NA_VALUES_FILTERED,
  61    col_in_dataframe,
  62    load_df,
  63    update_df,
  64)
  65
  66# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
  67# Please do not remove these import statements
  68from schematic.utils.general import (
   69    check_synapse_cache_size,
   70    clear_synapse_cache,
   71    create_like_statement,
   72    create_temp_folder,
   73    entity_type_mapping,
   74    get_dir_size,
   75)
  76from schematic.utils.io_utils import cleanup_temporary_storage
  77from schematic.utils.schema_utils import get_class_label_from_display_name
  78from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list
  79
  80
  81logger = logging.getLogger("Synapse storage")
  82
  83tracer = trace.get_tracer("Schematic")
  84
  85ID_COLUMN = "Id"
  86ENTITY_ID_COLUMN = "entityId"
  87UUID_COLUMN = "uuid"
  88
  89
  90@dataclass
  91class ManifestDownload(object):
  92    """
   93    syn: an object of type synapseclient.Synapse
  94    manifest_id: id of a manifest
  95    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
  96    """
  97
  98    syn: synapseclient.Synapse
  99    manifest_id: str
 100    synapse_entity_tracker: SynapseEntityTracker = field(
 101        default_factory=SynapseEntityTracker
 102    )
 103
 104    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
 105        """
 106        Try downloading a manifest to a specific folder (temporary or not). When the
 107        `use_temporary_folder` is set to True, the manifest will be downloaded to a
 108        temporary folder. This is useful for when the code is running as an API server
 109        where multiple requests are being made at the same time. This will prevent
 110        multiple requests from overwriting the same manifest file. When the
 111        `use_temporary_folder` is set to False, the manifest will be downloaded to the
 112        default manifest folder.
 113
 114        Args:
 115            use_temporary_folder: boolean argument indicating if a temporary folder
 116                should be used to store the manifest file. This is useful when running
 117                this code as an API server where multiple requests could be made at the
 118                same time. This is set to False when the code is being used from the
 119                CLI. Defaults to True.
 120
 121        Return:
 122            manifest_data: A Synapse file entity of the downloaded manifest
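
        Example (a minimal sketch; "syn12345678" stands in for a real manifest ID):
            md = ManifestDownload(syn=syn, manifest_id="syn12345678")
            manifest_file = md._download_manifest_to_folder(use_temporary_folder=True)
            print(manifest_file.path)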
 123        """
 124        manifest_data = self.synapse_entity_tracker.get(
 125            synapse_id=self.manifest_id,
 126            syn=self.syn,
 127            download_file=False,
 128            retrieve_if_not_present=False,
 129        )
 130        current_span = trace.get_current_span()
 131        if (
 132            manifest_data
 133            and (file_handle := manifest_data.get("_file_handle", None))
 134            and current_span.is_recording()
 135        ):
 136            current_span.set_attribute(
 137                "schematic.manifest_size", file_handle.get("contentSize", 0)
 138            )
 139
 140        if manifest_data and manifest_data.path:
 141            return manifest_data
 142
 143        if "SECRETS_MANAGER_SECRETS" in os.environ:
 144            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 145            cleanup_temporary_storage(
 146                temporary_manifest_storage, time_delta_seconds=3600
 147            )
 148            # create a new directory to store manifest
 149            if not os.path.exists(temporary_manifest_storage):
 150                os.mkdir(temporary_manifest_storage)
 151            # create temporary folders for storing manifests
 152            download_location = create_temp_folder(
 153                path=temporary_manifest_storage,
 154                prefix=f"{self.manifest_id}-{time.time()}-",
 155            )
 156        else:
 157            if use_temporary_folder:
 158                download_location = create_temp_folder(
 159                    path=CONFIG.manifest_folder,
 160                    prefix=f"{self.manifest_id}-{time.time()}-",
 161                )
 162            else:
 163                download_location = CONFIG.manifest_folder
 164
 165        manifest_data = self.synapse_entity_tracker.get(
 166            synapse_id=self.manifest_id,
 167            syn=self.syn,
 168            download_file=True,
 169            retrieve_if_not_present=True,
 170            download_location=download_location,
 171        )
 172
  173        # This is doing a rename of the downloaded file. This is important because
  174        # we may be re-using a file that was previously downloaded but then renamed.
  175        # The file downloaded from the Synapse client is just
 176        # a direct copy of that renamed file. This code will set the name of the file
 177        # to the original name that was used to download the file. Note: An MD5 checksum
 178        # of the file will still be performed so if the file has changed, it will be
 179        # downloaded again.
 180        filename = manifest_data._file_handle.fileName
 181        if filename != os.path.basename(manifest_data.path):
 182            parent_folder = os.path.dirname(manifest_data.path)
 183            manifest_original_name_and_path = os.path.join(parent_folder, filename)
 184
 185            self.syn.cache.remove(
 186                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
 187            )
 188            os.rename(manifest_data.path, manifest_original_name_and_path)
 189            manifest_data.path = manifest_original_name_and_path
 190            self.syn.cache.add(
 191                file_handle_id=manifest_data.dataFileHandleId,
 192                path=manifest_original_name_and_path,
 193                md5=manifest_data._file_handle.contentMd5,
 194            )
 195
 196        return manifest_data
 197
  198    def _entity_type_checking(self) -> None:
  199        """
  200        Check the entity type of the entity that needs to be downloaded.
  201        Return:
  202             None; logs an error if the entity type is not a file
  203        """
 204        # check the type of entity
 205        entity_type = entity_type_mapping(
 206            syn=self.syn,
 207            entity_id=self.manifest_id,
 208            synapse_entity_tracker=self.synapse_entity_tracker,
 209        )
 210        if entity_type != "file":
 211            logger.error(
 212                f"You are using entity type: {entity_type}. Please provide a file ID"
 213            )
 214
 215    def download_manifest(
 216        self,
 217        newManifestName: str = "",
 218        manifest_df: pd.DataFrame = pd.DataFrame(),
 219        use_temporary_folder: bool = True,
 220    ) -> Union[str, File]:
 221        """
 222        Download a manifest based on a given manifest id.
 223        Args:
 224            newManifestName(optional): new name of a manifest that gets downloaded.
  225            manifest_df(optional): a dataframe containing the names and ids of manifests in a given asset view
 226        Return:
 227            manifest_data: synapse entity file object
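
        Example (a minimal sketch; the ID and new name are hypothetical):
            md = ManifestDownload(syn=syn, manifest_id="syn12345678")
            manifest_data = md.download_manifest(newManifestName="renamed_manifest")
            # passing manifest_df enables the fallback to a censored manifest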
 228        """
 229
 230        # enables retrying if user does not have access to uncensored manifest
 231        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
 232        manifest_data = ""
 233
 234        # check entity type
 235        self._entity_type_checking()
 236
 237        # download a manifest
 238        try:
 239            manifest_data = self._download_manifest_to_folder(
 240                use_temporary_folder=use_temporary_folder
 241            )
 242        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
 243            # if there's an error getting an uncensored manifest, try getting the censored manifest
 244            if not manifest_df.empty:
 245                censored_regex = re.compile(".*censored.*")
 246                censored = manifest_df["name"].str.contains(censored_regex)
  247                new_manifest_id = manifest_df[censored]["id"].iloc[0]
 248                self.manifest_id = new_manifest_id
 249                try:
 250                    manifest_data = self._download_manifest_to_folder(
 251                        use_temporary_folder=use_temporary_folder
 252                    )
 253                except (
 254                    SynapseUnmetAccessRestrictions,
 255                    SynapseAuthenticationError,
 256                ) as e:
 257                    raise PermissionError(
 258                        "You don't have access to censored and uncensored manifests in this dataset."
 259                    ) from e
 260            else:
 261                logger.error(
 262                    f"You don't have access to the requested resource: {self.manifest_id}"
 263                )
 264
 265        if newManifestName and os.path.exists(manifest_data.get("path")):
 266            # Rename the file we just made to the new name
 267            new_manifest_filename = newManifestName + ".csv"
 268
 269            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
 270            parent_folder = os.path.dirname(manifest_data.get("path"))
 271
 272            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
 273
 274            # Copy file to new location. The purpose of using a copy instead of a rename
 275            # is to avoid any potential issues with the file being used in another
  276            # process. This avoids any potential race or concurrency conditions.
 277            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
 278
 279            # Adding this to cache will allow us to re-use the already downloaded
 280            # manifest file for up to 1 hour.
 281            self.syn.cache.add(
 282                file_handle_id=manifest_data.dataFileHandleId,
 283                path=new_manifest_path_name,
 284                md5=manifest_data._file_handle.contentMd5,
 285            )
 286
 287            # Update file names/paths in manifest_data
 288            manifest_data["name"] = new_manifest_filename
 289            manifest_data["filename"] = new_manifest_filename
 290            manifest_data["path"] = new_manifest_path_name
 291
 292        return manifest_data
 293
 294
 295class SynapseStorage(BaseStorage):
 296    """Implementation of Storage interface for datasets/files stored on Synapse.
  297    Provides utilities to list files in a specific project; update file annotations; create fileviews; etc.
 298
 299    TODO: Need to define the interface and rename and/or refactor some of the methods below.
 300    """
 301
 302    @tracer.start_as_current_span("SynapseStorage::__init__")
 303    def __init__(
 304        self,
 305        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
 306        access_token: Optional[str] = None,
 307        project_scope: Optional[list] = None,
 308        synapse_cache_path: Optional[str] = None,
 309        perform_query: Optional[bool] = True,
 310        columns: Optional[list] = None,
 311        where_clauses: Optional[list] = None,
 312    ) -> None:
 313        """Initializes a SynapseStorage object.
 314
 315        Args:
 316            token (Optional[str], optional):
 317              Optional token parameter as found in browser cookie upon login to synapse.
 318              Defaults to None.
  319            access_token (Optional[str], optional):
 320              Optional access token (personal or oauth).
 321              Defaults to None.
 322            project_scope (Optional[list], optional): Defaults to None.
 323            synapse_cache_path (Optional[str], optional):
 324              Location of synapse cache.
 325              Defaults to None.
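
        Example (a minimal sketch; assumes credentials come from the
            SYNAPSE_ACCESS_TOKEN environment variable or a .synapseConfig file):
            store = SynapseStorage(perform_query=False)
            store.query_fileview(columns=["id", "path"])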
 326        TODO:
  327            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
 328        """
 329        self.syn = self.login(synapse_cache_path, access_token)
 330        self.project_scope = project_scope
 331        self.storageFileview = CONFIG.synapse_master_fileview_id
 332        self.manifest = CONFIG.synapse_manifest_basename
 333        self.root_synapse_cache = self.syn.cache.cache_root_dir
 334        self.synapse_entity_tracker = SynapseEntityTracker()
 335        if perform_query:
 336            self.query_fileview(columns=columns, where_clauses=where_clauses)
 337
 338    # TODO: When moving this over to a regular cron-job the following logic should be
 339    # out of `manifest_download`:
 340    # if "SECRETS_MANAGER_SECRETS" in os.environ:
 341    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 342    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
 343    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
 344    def _purge_synapse_cache(
 345        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
 346    ) -> None:
 347        """
 348        Purge synapse cache if it exceeds a certain size. Default to 1GB.
 349        Args:
 350            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
 351              before purging cache. Default is 1 GB.
 352            minute_buffer (int): All files created this amount of time or older will be deleted
 353        """
 354        # try clearing the cache
 355        # scan a directory and check size of files
 356        if os.path.exists(self.root_synapse_cache):
 357            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
 358                1024**3
 359            )
 360            nbytes = get_dir_size(self.root_synapse_cache)
 361            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
  362            # if the cache limit has already been reached, purge files older than the minute buffer
 363            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
 364                num_of_deleted_files = clear_synapse_cache(
 365                    self.syn.cache, minutes=minute_buffer
 366                )
 367                logger.info(
  368                    f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
 369                )
 370            else:
  371                # on AWS, the OS takes around 14-17% of our ephemeral storage (20GiB)
  372                # instead of guessing how much space is left, log the total size of .synapseCache here
 373                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
 374
 375    @tracer.start_as_current_span("SynapseStorage::query_fileview")
 376    def query_fileview(
 377        self,
 378        columns: Optional[list] = None,
 379        where_clauses: Optional[list] = None,
 380        force_requery: Optional[bool] = False,
 381    ) -> None:
 382        """
 383        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
 384        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
 385        Args:
 386            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 387            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 388            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
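
        Example (a minimal sketch; the where clause syntax follows Synapse table SQL):
            store.query_fileview(columns=["id", "path"], where_clauses=["type='file'"])
            files_df = store.getStorageFileviewTable()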
 389        """
 390        self._purge_synapse_cache()
 391
 392        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
 393        self.new_query_different = True
 394
 395        # If a query has already been performed, store the query
 396        previous_query_built = hasattr(self, "fileview_query")
 397        if previous_query_built:
 398            previous_query = self.fileview_query
 399
 400        # Build a query with the current given parameters and check to see if it is different from the previous
 401        self._build_query(columns=columns, where_clauses=where_clauses)
 402        if previous_query_built:
 403            self.new_query_different = self.fileview_query != previous_query
 404
 405        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
 406        if self.new_query_different or force_requery:
 407            try:
 408                self.storageFileviewTable = self.syn.tableQuery(
 409                    query=self.fileview_query,
 410                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
 411            except SynapseHTTPError as exc:
 412                exception_text = str(exc)
 413                if "Unknown column path" in exception_text:
 414                    raise ValueError(
  415                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
 416                    )
 417                elif "Unknown column" in exception_text:
 418                    missing_column = exception_text.split("Unknown column ")[-1]
 419                    raise ValueError(
  420                        f"The column {missing_column} specified in the query does not exist in the fileview. Please make sure that the column name is correct and that all expected columns have been added to the fileview."
 421                    )
 422                else:
 423                    raise AccessCredentialsError(self.storageFileview)
 424
 425    @staticmethod
 426    def build_clause_from_dataset_id(
 427        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
 428    ) -> str:
 429        """
 430        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
 431        Args:
 432            dataset_id: Synapse ID of a dataset that should be used to limit the query
 433            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
 434        Returns:
 435            clause for the query or an empty string if no dataset ID is provided
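
        Example (hypothetical Synapse IDs):
            SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
            # -> "parentId='syn123'"
            SynapseStorage.build_clause_from_dataset_id(
                dataset_folder_list=["syn123", "syn456"]
            )
            # -> "parentId IN ('syn123', 'syn456')"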
 436        """
 437        # Calling this method without specifying synIDs will complete but will not scope the view
 438        if (not dataset_id) and (not dataset_folder_list):
 439            return ""
 440
 441        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
 442        if dataset_folder_list:
 443            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
 444            return f"parentId IN ({search_folders})"
 445
 446        # `dataset_id` should be provided when all files are stored directly under the dataset folder
 447        return f"parentId='{dataset_id}'"
 448
 449    def _build_query(
 450        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
 451    ):
 452        """
 453        Method to build a query for Synapse FileViews
 454        Args:
 455            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 456            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 457            self.storageFileview (str): Synapse FileView ID
 458            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
  459                Gets added to where_clauses; included mainly for backwards compatibility and as a more user-friendly way of subsetting the view in a simple way.
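
        Example (assuming self.storageFileview is the hypothetical view "syn23456789"):
            self._build_query(columns=["id", "path"], where_clauses=["type='file'"])
            # self.fileview_query == "SELECT id,path FROM syn23456789 WHERE type='file' ;"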
 460        """
 461        if columns is None:
 462            columns = []
 463        if where_clauses is None:
 464            where_clauses = []
 465
 466        if self.project_scope:
 467            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
 468            where_clauses.append(project_scope_clause)
 469
 470        if where_clauses:
 471            where_clauses = " AND ".join(where_clauses)
 472            where_clauses = f"WHERE {where_clauses} ;"
 473        else:
 474            where_clauses = ";"
 475
 476        if columns:
 477            columns = ",".join(columns)
 478        else:
 479            columns = "*"
 480
 481        self.fileview_query = (
 482            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
 483        )
 484
 485        return
 486
 487    @staticmethod
 488    @tracer.start_as_current_span("SynapseStorage::login")
 489    def login(
 490        synapse_cache_path: Optional[str] = None,
 491        access_token: Optional[str] = None,
 492    ) -> synapseclient.Synapse:
 493        """Login to Synapse
 494
 495        Args:
 496            access_token (Optional[str], optional): A synapse access token. Defaults to None.
 497            synapse_cache_path (Optional[str]): location of synapse cache
 498
 499        Raises:
  500            ValueError: If unable to log in with the access token
 501
 502        Returns:
 503            synapseclient.Synapse: A Synapse object that is logged in
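
        Example (a minimal sketch; the token value is hypothetical):
            syn = SynapseStorage.login(access_token="my-personal-access-token")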
 504        """
 505        if not access_token:
 506            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
 507
 508        # login using a token
 509        if access_token:
 510            try:
 511                syn = synapseclient.Synapse(
 512                    cache_root_dir=synapse_cache_path,
 513                    debug=False,
 514                    skip_checks=True,
 515                    cache_client=False,
 516                )
 517                syn.login(authToken=access_token, silent=True)
 518            except SynapseHTTPError as exc:
 519                raise ValueError(
 520                    "No access to resources. Please make sure that your token is correct"
 521                ) from exc
 522        else:
 523            # login using synapse credentials provided by user in .synapseConfig (default) file
 524            syn = synapseclient.Synapse(
 525                configPath=CONFIG.synapse_configuration_path,
 526                cache_root_dir=synapse_cache_path,
 527                debug=False,
 528                skip_checks=True,
 529                cache_client=False,
 530            )
 531            syn.login(silent=True)
 532
 533        # set user id attribute
 534        current_span = trace.get_current_span()
 535        if current_span.is_recording():
 536            current_span.set_attribute("user.id", syn.credentials.owner_id)
 537
 538        return syn
 539
 540    def missing_entity_handler(method):
 541        def wrapper(*args, **kwargs):
 542            try:
 543                return method(*args, **kwargs)
 544            except SynapseHTTPError as ex:
 545                str_message = str(ex).replace("\n", "")
 546                if "trash" in str_message or "does not exist" in str_message:
 547                    logging.warning(str_message)
 548                    return None
 549                else:
 550                    raise ex
 551
 552        return wrapper
 553
 554    def async_missing_entity_handler(method):
 555        """Decorator to handle missing entities in async methods."""
 556
 557        async def wrapper(*args: Any, **kwargs: Any) -> Any:
 558            try:
 559                return await method(*args, **kwargs)
 560            except SynapseHTTPError as ex:
 561                str_message = str(ex).replace("\n", "")
 562                if "trash" in str_message or "does not exist" in str_message:
 563                    logging.warning(str_message)
 564                    return None
 565                else:
 566                    raise ex
 567
 568        return wrapper
 569
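    # Example (a sketch; `get_table_entity` is a hypothetical method): applying
    # either decorator makes the wrapped call return None instead of raising
    # when the target entity is in the trash or no longer exists.
    #
    #     @missing_entity_handler
    #     def get_table_entity(self, synapse_id):
    #         return self.synapse_entity_tracker.get(synapse_id=synapse_id, syn=self.syn)
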
 570    def getStorageFileviewTable(self):
 571        """Returns the storageFileviewTable obtained during initialization."""
 572        return self.storageFileviewTable
 573
 574    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
 575        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
 576
 577        Args:
 578            currentUserId: synapse id for the user whose projects we want to get.
 579
 580        Returns:
 581            A dictionary with a next page token and the results.
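
        Example (a sketch of the flattened result; fields abbreviated):
            {"results": [{"id": "syn1", "name": "Project A"},
                         {"id": "syn2", "name": "Project B"}]}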
 582        """
 583        all_results = self.syn.restGET(
 584            "/projects/user/{principalId}".format(principalId=currentUserId)
 585        )
 586
 587        while (
 588            "nextPageToken" in all_results
 589        ):  # iterate over next page token in results while there is any
 590            results_token = self.syn.restGET(
 591                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
 592                    principalId=currentUserId,
 593                    nextPageToken=all_results["nextPageToken"],
 594                )
 595            )
 596            all_results["results"].extend(results_token["results"])
 597
 598            if "nextPageToken" in results_token:
 599                all_results["nextPageToken"] = results_token["nextPageToken"]
 600            else:
 601                del all_results["nextPageToken"]
 602
 603        return all_results
 604
 605    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
 606    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
 607        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
 608
 609        Returns:
 610            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
 611        """
 612
  613        # get the set of all storage Synapse projects accessible for this pipeline
 614        storageProjects = self.storageFileviewTable["projectId"].unique()
 615
  616        # get the set of storage Synapse projects accessible for this user
 617        # get a list of projects from Synapse
 618        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
 619            current_user_id=self.syn.credentials.owner_id, syn=self.syn
 620        )
 621        project_id_to_name_dict = {}
 622        current_user_projects = []
 623        for project_header in current_user_project_headers:
 624            project_id_to_name_dict[project_header.get("id")] = project_header.get(
 625                "name"
 626            )
 627            current_user_projects.append(project_header.get("id"))
 628
 629        # find set of user projects that are also in this pipeline's storage projects set
 630        storageProjects = list(set(storageProjects) & set(current_user_projects))
 631
 632        # Limit projects to scope if specified
 633        if project_scope:
 634            storageProjects = list(set(storageProjects) & set(project_scope))
 635
 636            if not storageProjects:
 637                raise Warning(
 638                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
 639                )
 640
 641        # prepare a return list of project IDs and names
 642        projects = []
 643        for projectId in storageProjects:
 644            project_name_from_project_header = project_id_to_name_dict.get(projectId)
 645            projects.append((projectId, project_name_from_project_header))
 646
 647        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
 648
 649        return sorted_projects_list
 650
 651    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
 652    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
 653        """Gets all datasets in folder under a given storage project that the current user has access to.
 654
 655        Args:
 656            projectId: synapse ID of a storage project.
 657
 658        Returns:
 659            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
 660            None: If the projectId cannot be found on Synapse.
 661        """
 662
 663        # select all folders and fetch their names from within the storage project;
 664        # if folder content type is defined, only select folders that contain datasets
 665        if "contentType" in self.storageFileviewTable.columns:
 666            foldersTable = self.storageFileviewTable[
 667                (self.storageFileviewTable["contentType"] == "dataset")
 668                & (self.storageFileviewTable["projectId"] == projectId)
 669            ]
 670        else:
 671            foldersTable = self.storageFileviewTable[
 672                (self.storageFileviewTable["type"] == "folder")
 673                & (self.storageFileviewTable["parentId"] == projectId)
 674            ]
 675
 676        # get an array of tuples (folderId, folderName)
 677        # some folders are part of datasets; others contain datasets
 678        # each dataset parent is the project; folders part of a dataset have another folder as a parent
 679        # to get folders if and only if they contain datasets for each folder
 680        # check if folder's parent is the project; if so that folder contains a dataset,
 681        # unless the folder list has already been filtered to dataset folders based on contentType attribute above
 682
 683        datasetList = []
 684        folderProperties = ["id", "name"]
 685        for folder in list(
 686            foldersTable[folderProperties].itertuples(index=False, name=None)
 687        ):
 688            datasetList.append(folder)
 689
 690        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
 691
 692        return sorted_dataset_list
 693
 694    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
 695    def getFilesInStorageDataset(
 696        self, datasetId: str, fileNames: List = None, fullpath: bool = True
 697    ) -> List[Tuple[str, str]]:
 698        """Gets all files (excluding manifest files) in a given dataset folder.
 699
 700        Args:
 701            datasetId: synapse ID of a storage dataset.
 702            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
 703            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
 704            fullpath: if True return the full path as part of this filename; otherwise return just base filename
 705
 706        Returns:
 707            A list of files; the list consists of tuples (fileId, fileName).
 708
 709        Raises:
 710            ValueError: Dataset ID not found.
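
        Example (hypothetical IDs and file names):
            store.getFilesInStorageDataset("syn99999999", fullpath=False)
            # -> [("syn11111111", "sample_a.bam"), ("syn22222222", "sample_b.bam")]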
 711        """
 712        file_list = []
 713
  714        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
 715        if self.storageFileviewTable.empty:
 716            raise ValueError(
 717                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
 718            )
 719        child_path = self.storageFileviewTable.loc[
 720            self.storageFileviewTable["parentId"] == datasetId, "path"
 721        ]
 722        if child_path.empty:
 723            raise LookupError(
 724                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
 725            )
 726        child_path = child_path.iloc[0]
 727
 728        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
 729        parent = child_path.split("/")[:-1]
 730        parent = "/".join(parent)
 731
 732        # When querying, only include files to exclude entity files and subdirectories
 733        where_clauses = [create_like_statement(parent), "type='file'"]
 734
 735        # Requery the fileview to specifically get the files in the given dataset
 736        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
 737
 738        # Exclude manifest files
 739        non_manifest_files = self.storageFileviewTable.loc[
 740            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
 741            :,
 742        ]
 743
 744        # Remove all files that are not in the list of fileNames
 745        if fileNames:
 746            filename_regex = "|".join(fileNames)
 747
 748            matching_files = non_manifest_files["path"].str.contains(
 749                filename_regex, case=False, regex=True
 750            )
 751
 752            non_manifest_files = non_manifest_files.loc[matching_files, :]
 753
 754        # Truncate path if necessary
 755        if not fullpath:
 756            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
 757
 758        # Return list of files as expected by other methods
 759        file_list = list(non_manifest_files.itertuples(index=False, name=None))
 760
 761        return file_list
 762
 763    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
 764        """If both censored and uncensored manifests are present, return uncensored manifest; if only one manifest is present, return manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one.
 765        Args:
  766        manifest: a dataframe containing the names and ids of manifests in a given asset view
 767
 768        Return:
 769        manifest_syn_id: id of a given censored or uncensored manifest
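
        Example (a sketch; names containing "censored" are deprioritized):
            manifest = pd.DataFrame({
                "id": ["syn1", "syn2"],
                "name": ["synapse_storage_manifest_censored.csv",
                         "synapse_storage_manifest.csv"],
            })
            self._get_manifest_id(manifest)  # -> "syn2"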
 770        """
 771        censored_regex = re.compile(".*censored.*")
 772        censored = manifest["name"].str.contains(censored_regex)
 773        if any(censored):
 774            # Try to use uncensored manifest first
 775            not_censored = ~censored
 776            if any(not_censored):
 777                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
 778            # if only censored manifests are available, just use the first censored manifest
 779            else:
 780                manifest_syn_id = manifest["id"].iloc[0]
 781
 782        # otherwise, use the first (implied only) version that exists
 783        else:
 784            manifest_syn_id = manifest["id"].iloc[0]
 785
 786        return manifest_syn_id
 787
 788    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
 789    def getDatasetManifest(
 790        self,
 791        datasetId: str,
 792        downloadFile: bool = False,
 793        newManifestName: str = "",
 794        use_temporary_folder: bool = True,
 795    ) -> Union[str, File]:
 796        """Gets the manifest associated with a given dataset.
 797
 798        Args:
 799            datasetId: synapse ID of a storage dataset.
 800            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
 801            newManifestName: new name of a manifest that gets downloaded
 802            use_temporary_folder: boolean argument indicating if a temporary folder
 803                should be used to store the manifest file. This is useful when running
 804                this code as an API server where multiple requests could be made at the
 805                same time. This is set to False when the code is being used from the
 806                CLI. Defaults to True.
 807
 808        Returns:
  809            manifest_syn_id (String): Synapse ID of existing manifest file.
  810            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
  811            "" (String): No pre-existing manifest in dataset.
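
        Example (a minimal sketch with a hypothetical dataset ID):
            manifest_syn_id = store.getDatasetManifest("syn99999999")
            manifest_file = store.getDatasetManifest("syn99999999", downloadFile=True)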
 812        """
 813        manifest_data = ""
 814
 815        # get a list of files containing the manifest for this dataset (if any)
 816        all_files = self.storageFileviewTable
 817
 818        # construct regex based on manifest basename in the config
  819        manifest_re = re.compile(os.path.basename(self.manifest) + r".*\.[tc]sv")
 820
 821        # search manifest based on given manifest basename regex above
 822        # and return a dataframe containing name and id of manifests in a given asset view
 823        manifest = all_files[
 824            (all_files["name"].str.contains(manifest_re, regex=True))
 825            & (all_files["parentId"] == datasetId)
 826        ]
 827
 828        manifest = manifest[["id", "name"]]
 829
  830        # if there is no pre-existing manifest in the specified dataset
 831        if manifest.empty:
 832            logger.warning(
 833                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
 834            )
 835            return ""
 836
  837        # if there is an existing manifest
 838        else:
 839            manifest_syn_id = self._get_manifest_id(manifest)
 840            if downloadFile:
 841                md = ManifestDownload(
 842                    self.syn,
 843                    manifest_id=manifest_syn_id,
 844                    synapse_entity_tracker=self.synapse_entity_tracker,
 845                )
 846                manifest_data = md.download_manifest(
 847                    newManifestName=newManifestName,
 848                    manifest_df=manifest,
 849                    use_temporary_folder=use_temporary_folder,
 850                )
  851                # TO DO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
 852                # then we should catch the error here without returning an empty string.
 853                if not manifest_data:
 854                    logger.debug(
 855                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
 856                    )
 857                return manifest_data
 858            return manifest_syn_id
 859
 860    def getDataTypeFromManifest(self, manifestId: str):
 861        """Fetch a manifest and return data types of all columns
 862        Args:
 863            manifestId: synapse ID of a manifest
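
        Example (hypothetical ID and columns; dtypes are inferred by pandas convert_dtypes):
            store.getDataTypeFromManifest("syn12345678")
            # -> {"Filename": "string", "Year of Birth": "Int64", ...}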
 864        """
 865        # get manifest file path
 866        manifest_entity = self.synapse_entity_tracker.get(
 867            synapse_id=manifestId, syn=self.syn, download_file=True
 868        )
 869        manifest_filepath = manifest_entity.path
 870
 871        # load manifest dataframe
 872        manifest = load_df(
 873            manifest_filepath,
 874            preserve_raw_input=False,
 875            data_model=False,
 876        )
 877
 878        # convert the dataFrame to use best possible dtypes.
 879        manifest_new = manifest.convert_dtypes()
 880
 881        # get data types of columns
 882        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
 883
 884        # return the result as a dictionary
 885        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
 886
 887        return result_dict
 888
 889    def _get_files_metadata_from_dataset(
 890        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
 891    ) -> Optional[dict]:
 892        """retrieve file ids under a particular datasetId
 893
 894        Args:
 895            datasetId (str): a dataset id
  896            only_new_files (bool): if True, only return files that do not already exist in the current manifest
  897            manifest (pd.DataFrame): metadata manifest dataframe. Defaults to None.
 898
 899        Returns:
  900            a dictionary of file names and entity ids under the given datasetId, or None if there are no files under the given dataset id
 901        """
 902        dataset_files = self.getFilesInStorageDataset(datasetId)
 903        if dataset_files:
 904            dataset_file_names_id_dict = self._get_file_entityIds(
 905                dataset_files, only_new_files=only_new_files, manifest=manifest
 906            )
 907            return dataset_file_names_id_dict
 908        else:
 909            return None
 910
 911    def add_entity_id_and_filename(
 912        self, datasetId: str, manifest: pd.DataFrame
 913    ) -> pd.DataFrame:
 914        """add entityid and filename column to an existing manifest assuming entityId column is not already present
 915
 916        Args:
 917            datasetId (str): dataset syn id
 918            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
 919
 920        Returns:
  921            pd.DataFrame: updated manifest with Filename and entityId columns populated
 922        """
 923        # get file names and entity ids of a given dataset
 924        dataset_files_dict = self._get_files_metadata_from_dataset(
 925            datasetId, only_new_files=False
 926        )
 927
 928        if dataset_files_dict:
 929            # turn manifest dataframe back to a dictionary for operation
 930            manifest_dict = manifest.to_dict("list")
 931
 932            # update Filename column
 933            # add entityId column to the end
 934            manifest_dict.update(dataset_files_dict)
 935
 936            # if the component column exists in existing manifest, fill up that column
 937            if "Component" in manifest_dict.keys():
 938                manifest_dict["Component"] = manifest_dict["Component"] * max(
 939                    1, len(manifest_dict["Filename"])
 940                )
 941
 942            # turn dictionary back to a dataframe
 943            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
 944            manifest_df_updated = manifest_df_index.transpose()
 945
 946            # fill na with empty string
 947            manifest_df_updated = manifest_df_updated.fillna("")
 948
 949            # drop index
 950            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
 951
 952            return manifest_df_updated
 953        else:
 954            return manifest
 955
 956    def fill_in_entity_id_filename(
 957        self, datasetId: str, manifest: pd.DataFrame
 958    ) -> Tuple[List, pd.DataFrame]:
 959        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 960
 961        Args:
 962            datasetId (str): dataset syn id
 963            manifest (pd.DataFrame): existing manifest dataframe.
 964
 965        Returns:
 966            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 967        """
 968        # get dataset file names and entity id as a list of tuple
 969        dataset_files = self.getFilesInStorageDataset(datasetId)
 970
 971        # update manifest with additional filenames, if any
 972        # note that if there is an existing manifest and there are files in the dataset
 973        # the columns Filename and entityId are assumed to be present in manifest schema
 974        # TODO: use idiomatic panda syntax
 975        if not dataset_files:
 976            manifest = manifest.fillna("")
 977            return dataset_files, manifest
 978
 979        all_files = self._get_file_entityIds(
 980            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 981        )
 982        new_files = self._get_file_entityIds(
 983            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 984        )
 985
 986        all_files = pd.DataFrame(all_files)
 987        new_files = pd.DataFrame(new_files)
 988
 989        # update manifest so that it contains new dataset files
 990        manifest = (
 991            pd.concat([manifest, new_files], sort=False)
 992            .reset_index()
 993            .drop("index", axis=1)
 994        )
 995
 996        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 997        manifest_reindex = manifest.set_index("entityId")
 998        all_files_reindex = all_files.set_index("entityId")
 999        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1000            manifest_reindex
1001        )
1002
1003        # Check if individual file paths in manifest and from synapse match
1004        file_paths_match = (
1005            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1006        )
1007
 1008        # If any of the paths do not match, update the manifest with the file paths from synapse
1009        if not file_paths_match.all():
1010            manifest_reindex.loc[
1011                ~file_paths_match, "Filename"
1012            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1013
1014            # reformat manifest for further use
1015            manifest = manifest_reindex.reset_index()
1016            entityIdCol = manifest.pop("entityId")
1017            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1018
1019        manifest = manifest.fillna("")
1020        return dataset_files, manifest
1021
1022    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1023    def updateDatasetManifestFiles(
1024        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1025    ) -> Union[Tuple[str, pd.DataFrame], None]:
1026        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1027
1028        Args:
1029            dmge: DataModelGraphExplorer Instance
1030            datasetId: synapse ID of a storage dataset.
1031            store: if set to True store updated manifest in asset store; if set to False
1032            return a Pandas dataframe containing updated manifest but do not store to asset store
1033
1034
1035        Returns:
1036            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1037            If there is no existing manifest or if the manifest does not have an entityId column, return None
1038        """
1039
1040        # get existing manifest Synapse ID
1041        manifest_id = self.getDatasetManifest(datasetId)
1042
1043        # if there is no manifest return None
1044        if not manifest_id:
1045            return None
1046
1047        manifest_entity = self.synapse_entity_tracker.get(
1048            synapse_id=manifest_id, syn=self.syn, download_file=True
1049        )
1050        manifest_filepath = manifest_entity.path
1051        manifest = load_df(manifest_filepath)
1052
1053        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1054        if "entityId" not in manifest.columns:
1055            return None
1056
1057        manifest_is_file_based = "Filename" in manifest.columns
1058
1059        if manifest_is_file_based:
1060            # update manifest with additional filenames, if any
1061            # note that if there is an existing manifest and there are files in the dataset
1062            # the columns Filename and entityId are assumed to be present in manifest schema
1063            # TODO: use idiomatic panda syntax
1064            dataset_files, manifest = self.fill_in_entity_id_filename(
1065                datasetId, manifest
1066            )
1067            if dataset_files:
1068                # update the manifest file, so that it contains the relevant entity IDs
1069                if store:
1070                    manifest.to_csv(manifest_filepath, index=False)
1071
1072                    # store manifest and update associated metadata with manifest on Synapse
1073                    manifest_id = self.associateMetadataWithFiles(
1074                        dmge, manifest_filepath, datasetId
1075                    )
1076
1077        return manifest_id, manifest
1078
1079    def _get_file_entityIds(
1080        self,
1081        dataset_files: List,
1082        only_new_files: bool = False,
1083        manifest: pd.DataFrame = None,
1084    ):
1085        """
1086        Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
1087
1088        Args:
1089            manifest: metadata manifest
 1090            dataset_files: List of all files in a dataset
1091            only_new_files: boolean to control whether only new files are returned or all files in the dataset
1092        Returns:
1093            files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
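
        Example (a sketch with hypothetical IDs):
            dataset_files = [("syn1", "a.txt"), ("syn2", "b.txt")]
            self._get_file_entityIds(dataset_files)
            # -> {"Filename": ["a.txt", "b.txt"], "entityId": ["syn1", "syn2"]}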
1094        """
1095        files = {"Filename": [], "entityId": []}
1096
1097        if only_new_files:
1098            if manifest is None:
1099                raise UnboundLocalError(
1100                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1101                )
1102
1103            if "entityId" not in manifest.columns:
1104                raise ValueError(
1105                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1106                    "Please generate an empty manifest without annotations, manually add annotations to the "
1107                    "appropriate files in the manifest, and then try again."
1108                )
1109
1110            # find new files (that are not in the current manifest) if any
1111            for file_id, file_name in dataset_files:
 1112                if file_id not in manifest["entityId"].values:
1113                    files["Filename"].append(file_name)
1114                    files["entityId"].append(file_id)
1115        else:
1116            # get all files
1117            for file_id, file_name in dataset_files:
1118                files["Filename"].append(file_name)
1119                files["entityId"].append(file_id)
1120
1121        return files
1122
1123    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1124    def getProjectManifests(
1125        self, projectId: str
1126    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1127        """Gets all metadata manifest files across all datasets in a specified project.
1128
1129        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1130                 as a list of tuples, one for each manifest:
1131                    [
1132                        (
1133                            (datasetId, dataName),
1134                            (manifestId, manifestName),
1135                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1136                        ),
1137                        ...
1138                    ]
1139
1140        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1141        """
1142        component = None
1143        entity = None
1144        manifests = []
1145
1146        datasets = self.getStorageDatasetsInProject(projectId)
1147
1148        for datasetId, datasetName in datasets:
1149            # encode information about the manifest in a simple list (so that R clients can unpack it)
1150            # eventually can serialize differently
1151
1152            # Get synID of manifest for a dataset
1153            manifestId = self.getDatasetManifest(datasetId)
1154
1155            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1156            if manifestId:
1157                annotations = self.getFileAnnotations(manifestId)
1158
1159                # If manifest has annotations specifying component, use that
1160                if annotations and "Component" in annotations:
1161                    component = annotations["Component"]
1162                    entity = self.synapse_entity_tracker.get(
1163                        synapse_id=manifestId, syn=self.syn, download_file=False
1164                    )
1165                    manifest_name = entity["properties"]["name"]
1166
1167                # otherwise download the manifest and parse for information
1168                elif not annotations or "Component" not in annotations:
1169                    logging.debug(
1170                        f"No component annotations have been found for manifest {manifestId}. "
1171                        "The manifest will be downloaded and parsed instead. "
1172                        "For increased speed, add component annotations to manifest."
1173                    )
1174
1175                    manifest_info = self.getDatasetManifest(
1176                        datasetId, downloadFile=True
1177                    )
1178                    manifest_name = manifest_info["properties"].get("name", "")
1179
1180                    if not manifest_name:
1181                        logger.error(f"Failed to download manifests from {datasetId}")
1182
1183                    manifest_path = manifest_info["path"]
1184
1185                    manifest_df = load_df(manifest_path)
1186
1187                    # Get component from component column if it exists
1188                    if (
1189                        "Component" in manifest_df
1190                        and not manifest_df["Component"].empty
1191                    ):
 1192                        # collect the unique component values in the manifest
 1193                        component = list(set(manifest_df["Component"]))
1194
1195                        # Added to address issues raised during DCA testing
1196                        if "" in component:
1197                            component.remove("")
1198
1199                        if len(component) == 1:
1200                            component = component[0]
1201                        elif len(component) > 1:
1202                            logger.warning(
1203                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1204                                "Behavior of manifests with multiple components is undefined."
1205                            )
1206            else:
1207                manifest_name = ""
1208                component = None
1209            if component:
1210                manifest = (
1211                    (datasetId, datasetName),
1212                    (manifestId, manifest_name),
1213                    (component, component),
1214                )
1215            elif manifestId:
1216                logger.debug(
1217                    f"Manifest {manifestId} does not have an associated Component"
1218                )
1219                manifest = (
1220                    (datasetId, datasetName),
1221                    (manifestId, manifest_name),
1222                    ("", ""),
1223                )
1224            else:
1225                manifest = (
1226                    (datasetId, datasetName),
1227                    ("", ""),
1228                    ("", ""),
1229                )
1230
1231            if manifest:
1232                manifests.append(manifest)
1233
1234        return manifests
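
    # Example usage of the method above (a minimal sketch; `store` is assumed
    # to be a configured SynapseStorage instance, the method name
    # getProjectManifests is assumed from context, and the project ID is
    # hypothetical):
    #
    #     for dataset, manifest, component in store.getProjectManifests("syn12345678"):
    #         dataset_id, dataset_name = dataset
    #         manifest_id, manifest_name = manifest
    #         print(dataset_name, manifest_name, component[0])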
1235
1236    def upload_project_manifests_to_synapse(
1237        self, dmge: DataModelGraphExplorer, projectId: str
1238    ) -> List[str]:
1239        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1240
1241        Returns: List of the dataset names whose manifests were found and loaded as tables.
1242        """
1243
1244        manifests = []
1245        manifest_loaded = []
1246        datasets = self.getStorageDatasetsInProject(projectId)
1247
1248        for datasetId, datasetName in datasets:
1249            # encode information about the manifest in a simple list (so that R clients can unpack it)
1250            # eventually can serialize differently
1251
1252            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1253
1254            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1255            if manifest_info:
1256                manifest_id = manifest_info["properties"]["id"]
1257                manifest_name = manifest_info["properties"]["name"]
1258                manifest_path = manifest_info["path"]
1259                manifest_df = load_df(manifest_path)
1260                manifest_table_id, manifest, table_manifest = self.uploadDB(
1261                    dmge=dmge,
1262                    manifest=manifest_df,
1263                    datasetId=datasetId,
1264                    table_name=datasetName,
1265                )
1266                manifest_loaded.append(datasetName)
1267        return manifest_loaded
1268
1269    def upload_annotated_project_manifests_to_synapse(
1270        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1271    ) -> Tuple[List[tuple], List[tuple]]:
1272        """
1273        Purpose:
1274            For each manifest in a project, upload it as a table and associate the manifest metadata with the dataset files.
1275            Assumes the manifest is already present as a CSV in a dataset in the project.
1276
            Returns:
                A tuple (manifests, manifest_loaded): all dataset manifest tuples found, and the subset that was loaded.
1277        """
            # Imported locally to avoid a circular import at module load time;
            # the module paths are assumed from the schematic package layout
            from schematic.schemas.data_model_graph import DataModelGraph
            from schematic.schemas.data_model_parser import DataModelParser

1278        # Instantiate DataModelParser
1279        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1280        # Parse Model
1281        parsed_data_model = data_model_parser.parse_model()
1282
1283        # Instantiate DataModelGraph
1284        data_model_grapher = DataModelGraph(parsed_data_model)
1285
1286        # Generate graph
1287        graph_data_model = data_model_grapher.generate_data_model_graph()
1288
1289        # Instantiate DataModelGraphExplorer
1290        dmge = DataModelGraphExplorer(graph_data_model)
1291
1292        manifests = []
1293        manifest_loaded = []
1294        datasets = self.getStorageDatasetsInProject(projectId)
1295        for datasetId, datasetName in datasets:
1296            # encode information about the manifest in a simple list (so that R clients can unpack it)
1297            # eventually can serialize differently
1298
1299            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1300            manifests.append(manifest)
1301
1302            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1303
1304            if manifest_info:
1305                manifest_id = manifest_info["properties"]["id"]
1306                manifest_name = manifest_info["properties"]["name"]
1307                manifest_path = manifest_info["path"]
1308                manifest = (
1309                    (datasetId, datasetName),
1310                    (manifest_id, manifest_name),
1311                    ("", ""),
1312                )
1313                if not dry_run:
1314                    self.associateMetadataWithFiles(
1315                        dmge, manifest_path, datasetId, manifest_record_type="table"
1316                    )
1317                manifest_loaded.append(manifest)
1318
1319        return manifests, manifest_loaded
1320
1321    def move_entities_to_new_project(
1322        self,
1323        projectId: str,
1324        newProjectId: str,
1325        returnEntities: bool = False,
1326        dry_run: bool = False,
1327    ):
1328        """
1329        For each manifest CSV in a project, collect all associated entity IDs,
1330        look up the corresponding entities in the files, and move them to the new project.
1331        """
1332
1333        manifests = []
1334        manifest_loaded = []
1335        datasets = self.getStorageDatasetsInProject(projectId)
1336        if datasets:
1337            for datasetId, datasetName in datasets:
1338                # encode information about the manifest in a simple list (so that R clients can unpack it)
1339                # eventually can serialize differently
1340
1341                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1342                manifests.append(manifest)
1343
1344                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1345                if manifest_info:
1346                    manifest_id = manifest_info["properties"]["id"]
1347                    manifest_name = manifest_info["properties"]["name"]
1348                    manifest_path = manifest_info["path"]
1349                    manifest_df = load_df(manifest_path)
1350
1351                    manifest = (
1352                        (datasetId, datasetName),
1353                        (manifest_id, manifest_name),
1354                        ("", ""),
1355                    )
1356                    manifest_loaded.append(manifest)
1357
1358                    annotation_entities = self.storageFileviewTable[
1359                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1360                        & (self.storageFileviewTable["type"] == "folder")
1361                    ]["id"]
1362
1363                    if returnEntities:
1364                        for entityId in annotation_entities:
1365                            if not dry_run:
1366                                moved_entity = self.syn.move(entityId, datasetId)
1367                                self.synapse_entity_tracker.add(
1368                                    synapse_id=moved_entity.id, entity=moved_entity
1369                                )
1370                            else:
1371                                logger.info(
1372                                    f"{entityId} will be moved to folder {datasetId}."
1373                                )
1374                    else:
1375                        # generate project folder
1376                        archive_project_folder = Folder(
1377                            projectId + "_archive", parent=newProjectId
1378                        )
1379                        archive_project_folder = self.syn.store(archive_project_folder)
1380                        self.synapse_entity_tracker.add(
1381                            synapse_id=archive_project_folder.id,
1382                            entity=archive_project_folder,
1383                        )
1384
1385                        # generate dataset folder
1386                        dataset_archive_folder = Folder(
1387                            "_".join([datasetId, datasetName, "archive"]),
1388                            parent=archive_project_folder.id,
1389                        )
1390                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1391                        self.synapse_entity_tracker.add(
1392                            synapse_id=dataset_archive_folder.id,
1393                            entity=dataset_archive_folder,
1394                        )
1395
1396                        for entityId in annotation_entities:
1397                            # move entities to folder
1398                            if not dry_run:
1399                                moved_entity = self.syn.move(
1400                                    entityId, dataset_archive_folder.id
1401                                )
1402                                self.synapse_entity_tracker.add(
1403                                    synapse_id=moved_entity.id, entity=moved_entity
1404                                )
1405                            else:
1406                                logger.info(
1407                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1408                                )
1409        else:
1410            raise LookupError(
1411                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1412            )
1413        return manifests, manifest_loaded
1414
1415    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1416    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1417        """Download a Synapse table as a pandas DataFrame.
1418
1419        Args:
1420            synapse_id: synapse ID of the table to query

            Returns:
                df (pd.DataFrame): the full contents of the table
                results (CsvFileTable): the query results, which also carry the table schema and etags
1421        """
1422
1423        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1424        df = results.asDataFrame(
1425            rowIdAndVersionInIndex=False,
1426            na_values=STR_NA_VALUES_FILTERED,
1427            keep_default_na=False,
1428        )
1429
1430        return df, results
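
    # Example (a minimal sketch; the table ID is hypothetical):
    #
    #     df, results = store.get_synapse_table("syn12345678")
    #     df.head()        # table contents as a pandas DataFrame
    #     results.tableId  # the query results also expose schema and etag details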
1431
1432    @missing_entity_handler
1433    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1434    def uploadDB(
1435        self,
1436        dmge: DataModelGraphExplorer,
1437        manifest: pd.DataFrame,
1438        datasetId: str,
1439        table_name: str,
1440        restrict: bool = False,
1441        table_manipulation: str = "replace",
1442        table_column_names: str = "class_label",
1443    ):
1444        """
1445        Method to upload a database to an asset store. In Synapse, this will upload a metadata table.
1446
1447        Args:
1448            dmge: DataModelGraphExplorer object
1449            manifest: pd.Df manifest to upload
1450            datasetId: synID of the dataset for the manifest
1451            table_name: name of the table to be uploaded
1452            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1454            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1455            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1456                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1457                display label formatting.
1458        Returns:
1459            manifest_table_id: synID of the uploaded table
1460            manifest: the original manifest
1461            table_manifest: manifest formatted appropriately for the table
1462
1463        """
1464
1465        col_schema, table_manifest = self.formatDB(
1466            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1467        )
1468
1469        manifest_table_id = self.buildDB(
1470            datasetId,
1471            table_name,
1472            col_schema,
1473            table_manifest,
1474            table_manipulation,
1475            dmge,
1476            restrict,
1477        )
1478
1479        return manifest_table_id, manifest, table_manifest
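
    # Example (a sketch, assuming a DataModelGraphExplorer `dmge` and a manifest
    # DataFrame `manifest_df` with an `entityId` column; the dataset ID is
    # hypothetical):
    #
    #     table_id, manifest, table_manifest = store.uploadDB(
    #         dmge=dmge,
    #         manifest=manifest_df,
    #         datasetId="syn12345678",
    #         table_name="patient_synapse_storage_manifest_table",
    #         table_manipulation="replace",
    #     )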
1480
1481    @tracer.start_as_current_span("SynapseStorage::formatDB")
1482    def formatDB(self, dmge, manifest, table_column_names):
1483        """
1484        Method to format a manifest appropriately for upload as a table
1485
1486        Args:
1487            dmge: DataModelGraphExplorer object
1488            manifest: pd.Df manifest to upload
1489            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1490                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1491                display label formatting.
1492        Returns:
1493            col_schema: schema for table columns: type, size, etc
1494            table_manifest: formatted manifest
1495
1496        """
1497        # Rename the manifest columns to display names to match fileview
1498
1499        blacklist_chars = ["(", ")", ".", " ", "-"]
1500        manifest_columns = manifest.columns.tolist()
1501
1502        table_manifest = deepcopy(manifest)
1503
1504        if table_column_names == "display_name":
1505            cols = table_manifest.columns
1506
1507        elif table_column_names == "display_label":
1508            cols = [
1509                str(col).translate({ord(x): "" for x in blacklist_chars})
1510                for col in manifest_columns
1511            ]
1512
1513        elif table_column_names == "class_label":
1514            cols = [
1515                get_class_label_from_display_name(str(col)).translate(
1516                    {ord(x): "" for x in blacklist_chars}
1517                )
1518                for col in manifest_columns
1519            ]
1520        else:
1521            raise ValueError(
1522                f"The provided table_column_names: {table_column_names} is not valid, please resubmit with an allowed value only."
1523            )
1524
1525        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1526
1527        # Reset column names in table manifest
1528        table_manifest.columns = cols
1529
1530        # move entity id to end of df
1531        entity_col = table_manifest.pop("entityId")
1532        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1533
1534        # Get the column schema
1535        col_schema = as_table_columns(table_manifest)
1536
1537        # Set Id column length to 64 (not set automatically by as_table_columns)
1538        for i, col in enumerate(col_schema):
1539            if col["name"].lower() == "id":
1540                col_schema[i]["maximumSize"] = 64
1541
1542        return col_schema, table_manifest
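
    # Illustration of the three labeling styles for a display name such as
    # "family history (cancer)" (approximate; the class_label output depends on
    # get_class_label_from_display_name):
    #   display_name  -> "family history (cancer)"  (unchanged)
    #   display_label -> "familyhistorycancer"      (blacklisted characters stripped)
    #   class_label   -> "FamilyHistoryCancer"      (upper camelcase, then stripped)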
1543
1544    @tracer.start_as_current_span("SynapseStorage::buildDB")
1545    def buildDB(
1546        self,
1547        datasetId: str,
1548        table_name: str,
1549        col_schema: List,
1550        table_manifest: pd.DataFrame,
1551        table_manipulation: str,
1552        dmge: DataModelGraphExplorer,
1553        restrict: bool = False,
1554    ):
1555        """
1556        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1557        Calls TableOperations class to execute
1558
1559        Args:
1560            datasetId: synID of the dataset for the manifest
1561            table_name: name of the table to be uploaded
1562            col_schema: schema for table columns: type, size, etc from `formatDB`
1563            table_manifest: formatted manifest that can be uploaded as a table
1564            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
                dmge: DataModelGraphExplorer object, used to look up validation rules when upserting
1565            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1566
1567        Returns:
1568            manifest_table_id: synID of the uploaded table
1569
1570        """
1571        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1572        existing_table_id = self.syn.findEntityId(
1573            name=table_name, parent=table_parent_id
1574        )
1575        tableOps = TableOperations(
1576            synStore=self,
1577            tableToLoad=table_manifest,
1578            tableName=table_name,
1579            datasetId=datasetId,
1580            existingTableId=existing_table_id,
1581            restrict=restrict,
1582            synapse_entity_tracker=self.synapse_entity_tracker,
1583        )
1584
1585        if not table_manipulation or existing_table_id is None:
1586            manifest_table_id = tableOps.createTable(
1587                columnTypeDict=col_schema,
1588                specifySchema=True,
1589            )
1590        elif existing_table_id is not None:
1591            if table_manipulation.lower() == "replace":
1592                manifest_table_id = tableOps.replaceTable(
1593                    specifySchema=True,
1594                    columnTypeDict=col_schema,
1595                )
1596            elif table_manipulation.lower() == "upsert":
1597                manifest_table_id = tableOps.upsertTable(
1598                    dmge=dmge,
1599                )
1600            elif table_manipulation.lower() == "update":
1601                manifest_table_id = tableOps.updateTable()
            else:
                # Fail fast instead of returning an unbound manifest_table_id below
                raise ValueError(
                    f"Unsupported table_manipulation: {table_manipulation}. "
                    "Allowed values are 'replace', 'upsert' and 'update'."
                )
1602
1603        if table_manipulation and table_manipulation.lower() == "upsert":
1604            table_entity = self.synapse_entity_tracker.get(
1605                synapse_id=existing_table_id or manifest_table_id,
1606                syn=self.syn,
1607                download_file=False,
1608            )
1609            annos = OldAnnotations(
1610                id=table_entity.id,
1611                etag=table_entity.etag,
1612                values=table_entity.annotations,
1613            )
1614            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1615            annos = self.syn.set_annotations(annos)
1616            table_entity.etag = annos.etag
1617            table_entity.annotations = annos
1618
1619        return manifest_table_id
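
    # Example (a sketch; the dataset ID is hypothetical) of replacing an
    # existing table of the same name in the dataset's project:
    #
    #     table_id = store.buildDB(
    #         datasetId="syn12345678",
    #         table_name=table_name,
    #         col_schema=col_schema,
    #         table_manifest=table_manifest,
    #         table_manipulation="replace",  # or "upsert" / "update"
    #         dmge=dmge,
    #     )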
1620
1621    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1622    def upload_manifest_file(
1623        self,
1624        manifest,
1625        metadataManifestPath,
1626        datasetId,
1627        restrict_manifest,
1628        component_name="",
1629    ):
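        """Upload a manifest to Synapse as a CSV file, naming it from the
        configured manifest basename, the component name, and a "_censored"
        suffix when the source file name marks it as censored.

        Args:
            manifest (pd.DataFrame): loaded df containing user supplied data.
            metadataManifestPath (str): path to the manifest CSV on disk.
            datasetId (str): synapse ID of the folder containing the dataset.
            restrict_manifest (bool): whether the upload requires additional
                access restrictions.
            component_name (str): name of the manifest component, if any.

        Returns:
            manifest_synapse_file_id (str): synID of the uploaded manifest CSV.
        """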
1630        # Update manifest to have the new entityId column
1631        manifest.to_csv(metadataManifestPath, index=False)
1632
1633        # store manifest to Synapse as a CSV
1634        # update file name
1635        file_name_full = metadataManifestPath.split("/")[-1]
1636        file_extension = file_name_full.split(".")[-1]
1637
1638        # Differentiate "censored" and "uncensored" manifest
1639        if "censored" in file_name_full:
1640            file_name_new = (
1641                os.path.basename(CONFIG.synapse_manifest_basename)
1642                + "_"
1643                + component_name
1644                + "_censored"
1645                + "."
1646                + file_extension
1647            )
1648        else:
1649            file_name_new = (
1650                os.path.basename(CONFIG.synapse_manifest_basename)
1651                + "_"
1652                + component_name
1653                + "."
1654                + file_extension
1655            )
1656
1657        manifest_synapse_file = None
1658        try:
1659            # Rename the file to file_name_new then revert
1660            # This is to maintain the original file name in case other code is
1661            # expecting that the file exists with the original name
1662            original_file_path = metadataManifestPath
1663            new_file_path = os.path.join(
1664                os.path.dirname(metadataManifestPath), file_name_new
1665            )
1666            os.rename(original_file_path, new_file_path)
1667
1668            manifest_synapse_file = self._store_file_for_manifest_upload(
1669                new_file_path=new_file_path,
1670                dataset_id=datasetId,
1671                existing_file_name=file_name_full,
1672                file_name_new=file_name_new,
1673                restrict_manifest=restrict_manifest,
1674            )
1675            manifest_synapse_file_id = manifest_synapse_file.id
1676
1677        finally:
1678            # Revert the file name back to the original
1679            os.rename(new_file_path, original_file_path)
1680
1681            if manifest_synapse_file:
1682                manifest_synapse_file.path = original_file_path
1683
1684        return manifest_synapse_file_id
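
    # Example of the resulting Synapse file names (assuming the configured
    # basename is "synapse_storage_manifest" and component_name is "patient"):
    #   uncensored: synapse_storage_manifest_patient.csv
    #   censored:   synapse_storage_manifest_patient_censored.csv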
1685
1686    def _store_file_for_manifest_upload(
1687        self,
1688        new_file_path: str,
1689        dataset_id: str,
1690        existing_file_name: str,
1691        file_name_new: str,
1692        restrict_manifest: bool,
1693    ) -> File:
1694        """Handles a create or update of a manifest file that is going to be uploaded.
1695        If we already have a copy of the Entity in memory we will update that instance,
1696        otherwise create a new File instance to be created in Synapse. Once stored
1697        this will add the file to the `synapse_entity_tracker` for future reference.
1698
1699        Args:
1700            new_file_path (str): The path to the new manifest file
1701            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
1702            existing_file_name (str): The name of the existing file
1703            file_name_new (str): The name of the new file
1704            restrict_manifest (bool): Whether the manifest should be restricted
1705
1706        Returns:
1707            File: The stored manifest file
1708        """
1709        local_tracked_file_instance = (
1710            self.synapse_entity_tracker.search_local_by_parent_and_name(
1711                name=existing_file_name, parent_id=dataset_id
1712            )
1713            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1714                name=file_name_new, parent_id=dataset_id
1715            )
1716        )
1717
1718        if local_tracked_file_instance:
1719            local_tracked_file_instance.path = new_file_path
1720            local_tracked_file_instance.description = (
1721                "Manifest for dataset " + dataset_id
1722            )
1723            manifest_synapse_file = local_tracked_file_instance
1724        else:
1725            manifest_synapse_file = File(
1726                path=new_file_path,
1727                description="Manifest for dataset " + dataset_id,
1728                parent=dataset_id,
1729                name=file_name_new,
1730            )
1731
1732        manifest_synapse_file = self.syn.store(
1733            manifest_synapse_file, isRestricted=restrict_manifest
1734        )
1735
1736        self.synapse_entity_tracker.add(
1737            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1738        )
1739        return manifest_synapse_file
1740
1741    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1742        """get annotations asynchronously
1743
1744        Args:
1745            synapse_id (str): synapse id of the entity that the annotation belongs to
1746
1747        Returns:
1748            Dict[str, Any]: The requested entity bundle matching
1749            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1750        """
1751        return await get_entity_id_bundle2(
1752            entity_id=synapse_id,
1753            request={"includeAnnotations": True},
1754            synapse_client=self.syn,
1755        )
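
    # Example (a sketch; must be awaited inside an event loop, and the entity
    # ID is hypothetical):
    #
    #     bundle = await store.get_async_annotation("syn12345678")
    #     annotations = bundle["annotations"]["annotations"]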
1756
1757    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1758        """Store annotations asynchronously and keep the locally tracked entity in sync.
1759
1760        Args:
1761            annotation_dict (dict): annotation in a dictionary format
1762
1763        Returns:
1764            Annotations: The stored annotations.
1765        """
1766        annotation_data = Annotations.from_dict(
1767            synapse_annotations=annotation_dict["annotations"]["annotations"]
1768        )
1769        annotation_class = Annotations(
1770            annotations=annotation_data,
1771            etag=annotation_dict["annotations"]["etag"],
1772            id=annotation_dict["annotations"]["id"],
1773        )
1774        annotation_storage_result = await annotation_class.store_async(
1775            synapse_client=self.syn
1776        )
1777        local_entity = self.synapse_entity_tracker.get(
1778            synapse_id=annotation_dict["annotations"]["id"],
1779            syn=self.syn,
1780            download_file=False,
1781            retrieve_if_not_present=False,
1782        )
1783        if local_entity:
1784            local_entity.etag = annotation_storage_result.etag
1785            local_entity.annotations = annotation_storage_result
1786        return annotation_storage_result
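
    # Sketch of the expected input shape, as produced by get_async_annotation
    # or format_row_annotations:
    #
    #     {"annotations": {"id": "syn...", "etag": "...", "annotations": {...}}}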
1787
1788    def process_row_annotations(
1789        self,
1790        dmge: DataModelGraphExplorer,
1791        metadata_syn: Dict[str, Any],
1792        hide_blanks: bool,
1793        csv_list_regex: str,
1794        annos: Dict[str, Any],
1795        annotation_keys: str,
1796    ) -> Dict[str, Any]:
1797        """Processes metadata annotations based on the logic below:
1798        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1799            An empty or whitespace-only string.
1800            A NaN value (if the annotation is a float).
1801        If either condition is met and hide_blanks is True, the annotation key is dropped from the upload and no further processing is done for that key.
1802        If either condition is met and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1803
1804        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1805        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1806
1807        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1808
1809        4. Returns the updated annotations dictionary.
1810
1811        Args:
1812            dmge (DataModelGraphExplorer): data model graph explorer
1813            metadata_syn (dict): metadata used for Synapse storage
1814            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1815            csv_list_regex (str): Regex to match with comma separated list
1816            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1817            annotation_keys (str): display_label/class_label
1818
1819        Returns:
1820            Dict[str, Any]: annotations as a dictionary
1821
1822        ```mermaid
1823        flowchart TD
1824            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1825            C -- Yes --> D{Is hide_blanks True?}
1826            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1827            D -- No --> F[Assign empty string to annotation key]
1828            C -- No --> G{Is anno_v a string?}
1829            G -- No --> H[Assign original value of anno_v to annotation key]
1830            G -- Yes --> I{Does anno_v match csv_list_regex?}
1831            I -- Yes --> J[Get validation rule of anno_k]
1832            J --> K{Does the validation rule contain 'list'}
1833            K -- Yes --> L[Split anno_v by commas and assign as list]
1834            I -- No --> H
1835            K -- No --> H
1836        ```
1837        """
1838        for anno_k, anno_v in metadata_syn.items():
1839            # Remove keys with nan or empty string values or string that only contains white space from dict of annotations to be uploaded
1840            # if present on current data annotation
1841            if hide_blanks and (
1842                (isinstance(anno_v, str) and anno_v.strip() == "")
1843                or (isinstance(anno_v, float) and np.isnan(anno_v))
1844            ):
1845                annos["annotations"]["annotations"].pop(anno_k, None)
1848                continue
1849
1850            # Otherwise save annotation as appropriate
1851            if isinstance(anno_v, float) and np.isnan(anno_v):
1852                annos["annotations"]["annotations"][anno_k] = ""
1853                continue
1854
1855            # Handle strings that match the csv_list_regex and pass the validation rule
1856            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1857                # Use a dictionary to dynamically choose the argument
1858                param = (
1859                    {"node_display_name": anno_k}
1860                    if annotation_keys == "display_label"
1861                    else {"node_label": anno_k}
1862                )
1863                node_validation_rules = dmge.get_node_validation_rules(**param)
1864
1865                if rule_in_rule_list("list", node_validation_rules):
1866                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1867                    continue
1868            # default: assign the original value
1869            annos["annotations"]["annotations"][anno_k] = anno_v
1870
1871        return annos
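
    # Illustration (hypothetical key, assuming a "list" validation rule on
    # "tissueType"): with hide_blanks=True, metadata_syn of
    #     {"tissueType": "blood,brain", "notes": ""}
    # yields annotations {"tissueType": ["blood", "brain"]}, while "notes" is
    # dropped from the upload.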
1872
1873    @async_missing_entity_handler
1874    async def format_row_annotations(
1875        self,
1876        dmge: DataModelGraphExplorer,
1877        row: pd.Series,
1878        entityId: str,
1879        hideBlanks: bool,
1880        annotation_keys: str,
1881    ) -> Union[None, Dict[str, Any]]:
1882        """Format row annotations
1883
1884        Args:
1885            dmge (DataModelGraphExplorer): data model graph explorer object
1886            row (pd.Series): row of the manifest
1887            entityId (str): entity id of the manifest
1888            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, uploads annotation keys with empty string values.
1889            annotation_keys (str): display_label/class_label
1890
1891        Returns:
1892            Union[None, Dict[str, Any]]: if entity id is in trash can, return None. Otherwise, return the annotations
1893        """
1894        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support, e.g. no spaces or parentheses)
1895        # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest
1896        # this could create a divergence between manifest column and annotations. this should be ok for most use cases.
1897        # columns with special characters are outside of the schema
1898        metadataSyn = {}
1899        blacklist_chars = ["(", ")", ".", " ", "-"]
1900
1901        for k, v in row.to_dict().items():
1902            if annotation_keys == "display_label":
1903                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1904            elif annotation_keys == "class_label":
1905                keySyn = get_class_label_from_display_name(str(k)).translate(
1906                    {ord(x): "" for x in blacklist_chars}
1907                )
1908
1909            # Skip `Filename` and `ETag` columns when setting annotations
1910            if keySyn in ["Filename", "ETag", "eTag"]:
1911                continue
1912
1913            # truncate annotation values that are 500 characters or longer:
1914            # keep the first 472 characters and append the 27-character
1915            # [truncatedByDataCuratorApp] marker (499 characters total) so
1916            # the value fits the annotation size limit and readers can tell
1917            # that the cell value has been truncated
1918            if isinstance(v, str) and len(v) >= 500:
1919                v = v[0:472] + "[truncatedByDataCuratorApp]"
1920
1921            metadataSyn[keySyn] = v
1922
1923        # This will first check if the entity is already in memory, and if so, that
1924        # instance is used. Unfortunately, the expected return format needs to match
1925        # the Synapse API, so we need to convert the annotations to the expected format.
1926        entity = self.synapse_entity_tracker.get(
1927            synapse_id=entityId,
1928            syn=self.syn,
1929            download_file=False,
1930            retrieve_if_not_present=False,
1931        )
1932        if entity is not None:
1933            synapse_annotations = _convert_to_annotations_list(
1934                annotations=entity.annotations
1935            )
1936            annos = {
1937                "annotations": {
1938                    "id": entity.id,
1939                    "etag": entity.etag,
1940                    "annotations": synapse_annotations,
1941                }
1942            }
1943        else:
1944            annos = await self.get_async_annotation(entityId)
1945
1946        # set annotation(s) for the various objects/items in a dataset on Synapse
1947        csv_list_regex = comma_separated_list_regex()
1948
1949        annos = self.process_row_annotations(
1950            dmge=dmge,
1951            metadata_syn=metadataSyn,
1952            hide_blanks=hideBlanks,
1953            csv_list_regex=csv_list_regex,
1954            annos=annos,
1955            annotation_keys=annotation_keys,
1956        )
1957
1958        return annos
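
    # Illustration of the truncation rule above: a 600-character string value
    # is stored as v[0:472] + "[truncatedByDataCuratorApp]", i.e. 499
    # characters in total.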
1959
1960    @missing_entity_handler
1961    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
1962    def format_manifest_annotations(self, manifest, manifest_synapse_id):
1963        """
1964        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
1965        For now just getting the Component.
1966        """
1967
1968        entity = self.synapse_entity_tracker.get(
1969            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
1970        )
1971        is_file = entity.concreteType.endswith(".FileEntity")
1972        is_table = entity.concreteType.endswith(".TableEntity")
1973
1974        if is_file:
1975            # Get file metadata
1976            metadata = self.getFileAnnotations(manifest_synapse_id)
1977
1978            # If there is a defined component add it to the metadata.
1979            if "Component" in manifest.columns:
1980                # Gather component information
1981                component = manifest["Component"].unique()
1982
1983                # Double check that only a single component is listed, else raise an error.
1984                if len(component) != 1:
1985                    raise ValueError(
1986                        "Manifest has more than one component. Please check manifest and resubmit."
1987                    )
1990
1991                # Add component to metadata
1992                metadata["Component"] = component[0]
1993
1994        elif is_table:
1995            # Get table metadata
1996            metadata = self.getTableAnnotations(manifest_synapse_id)
        else:
            # Neither a file nor a table; there is no extra metadata to gather
            metadata = {}
1997
1998        # Get annotations
1999        annos = OldAnnotations(
2000            id=entity.id, etag=entity.etag, values=entity.annotations
2001        )
2002
2003        # Add metadata to the annotations
2004        for annos_k, annos_v in metadata.items():
2005            annos[annos_k] = annos_v
2006        return annos
2007
2008    '''
2009    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
2010        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
2011        """
2012        Purpose:
2013            Works very similarly to associateMetadataWithFiles except takes in the manifest
2014            rather than the manifest path
2015
2016        """
2017
2018        # Add uuid for table updates and fill.
2019        if not "Uuid" in manifest.columns:
2020            manifest["Uuid"] = ''
2021
2022        for idx,row in manifest.iterrows():
2023            if not row["Uuid"]:
2024                gen_uuid = uuid.uuid4()
2025                row["Uuid"] = gen_uuid
2026                manifest.loc[idx, 'Uuid'] = gen_uuid
2027
2028        # add entityId as a column if not already there or
2029        # fill any blanks with an empty string.
2030        if not "entityId" in manifest.columns:
2031            manifest["entityId"] = ""
2032        else:
2033            manifest["entityId"].fillna("", inplace=True)
2034
2035        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
2036        dmge = DataModelGraphExplorer()
2037
2038        # Create table name here.
2039        if 'Component' in manifest.columns:
2040            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
2041        else:
2042            table_name = 'synapse_storage_manifest_table'
2043
2044        # Upload manifest as a table and get the SynID and manifest
2045        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
2046                                                    dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
2047
2048        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
2049        # also set metadata for each synapse entity as Synapse annotations
2050        for idx, row in manifest.iterrows():
2051            if not row["entityId"]:
2052                # If not using entityIds, fill with manifest_table_id so
2053                row["entityId"] = manifest_synapse_table_id
2054                entityId = ''
2055            else:
2056                # get the entity id corresponding to this row
2057                entityId = row["entityId"]
2058
2059        # Load manifest to synapse as a CSV File
2060        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
2061
2062        # Get annotations for the file manifest.
2063        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
2064
2065        self.syn.set_annotations(manifest_annotations)
2066
2067        logger.info("Associated manifest file with dataset on Synapse.")
2068
2069        # Update manifest Synapse table with new entity id column.
2070        self.make_synapse_table(
2071            table_to_load = table_manifest,
2072            dataset_id = datasetId,
2073            existingTableId = manifest_synapse_table_id,
2074            table_name = table_name,
2075            update_col = 'Uuid',
2076            specify_schema = False,
2077            )
2078
2079        # Get annotations for the table manifest
2080        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
2081        self.syn.set_annotations(manifest_annotations)
2082        return manifest_synapse_table_id
2083    '''
2084
2085    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
2086        """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing.
2087        Args:
2088            metadataManifestPath (str): path where manifest is stored
2089        Returns:
2090            manifest(pd.DataFrame): Manifest loaded as a pandas dataframe
2091        Raises:
2092            FileNotFoundError: Manifest file does not exist at provided path.
2093        """
2094        # read new manifest csv
2095        try:
2096            load_args = {
2097                "dtype": "string",
2098            }
2099            manifest = load_df(
2100                metadataManifestPath,
2101                preserve_raw_input=False,
2102                allow_na_values=False,
2103                **load_args,
2104            )
2105        except FileNotFoundError as err:
2106            raise FileNotFoundError(
2107                f"No manifest file was found at this path: {metadataManifestPath}"
2108            ) from err
2109        return manifest
2110
2111    def _add_id_columns_to_manifest(
2112        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
2113    ) -> pd.DataFrame:
2114        """
2115        Ensures that the manifest DataFrame has standardized 'Id' and 'entityId' columns.
2116
2117        - If any case variation of the 'id' column is present (e.g., 'id', 'ID', 'iD'), it is renamed to 'Id'.
2118        - If any case variation of the 'entityid' column is present, it is renamed to 'entityId'.
2119        - If any case variation of the 'uuid' column is present, it is renamed to 'uuid' before further processing.
2120        - If 'Id' is still missing:
2121            - It will be created as an empty column, or
2122            - Derived from a 'Uuid' column, depending on whether 'uuid' is defined in the schema.
2123        - If both 'uuid' and 'Id' columns exist, the 'uuid' column is dropped.
2124        - Missing values in the 'Id' column are filled with generated UUIDs.
2125        - If 'entityId' is still missing, it will be created and filled with empty strings.
2126        - If 'entityId' is already present, any missing values will be replaced with empty strings.
2127
2128        Args:
2129            manifest (pd.DataFrame): The metadata manifest to be updated.
2130            dmge (DataModelGraphExplorer): Data model graph explorer object.
2131
2132        Returns:
2133            pd.DataFrame: The updated manifest with a standardized 'Id' column and an 'entityId' column.
2134        """
2135
2136        # Normalize any variation of 'id' to 'Id', "entityid" to "entityId", "Uuid" to "uuid"
2137        for col in manifest.columns:
2138            if col.lower() == "id":
2139                manifest = manifest.rename(columns={col: ID_COLUMN})
2140            if col.lower() == "entityid":
2141                manifest = manifest.rename(columns={col: ENTITY_ID_COLUMN})
2142            if col.lower() == "uuid":
2143                manifest = manifest.rename(columns={col: UUID_COLUMN})
2144
2145        # If 'Id' still doesn't exist, see if uuid column exists
2146        # Rename uuid column to "Id" column
2147        if ID_COLUMN not in manifest.columns:
2148            # See if schema has `Uuid` column specified
2149            try:
2150                uuid_col_in_schema = dmge.is_class_in_schema(
2151                    "Uuid"
2152                ) or dmge.is_class_in_schema("uuid")
2153            except KeyError:
2154                uuid_col_in_schema = False
2155
2156            # Rename `uuid` column if it wasn't specified in the schema
2157            if UUID_COLUMN in manifest.columns and not uuid_col_in_schema:
2158                manifest = manifest.rename(columns={UUID_COLUMN: ID_COLUMN})
2159            # If no `uuid` column exists or it is specified in the schema, create a new `Id` column
2160            else:
2161                manifest[ID_COLUMN] = ""
2162        else:
2163            # 'Id' already exists, ignore 'uuid'
2164            if UUID_COLUMN in manifest.columns:
2165                manifest = manifest.drop(columns=[UUID_COLUMN])
2166
2167        # Fill in UUIDs in the "Id" column if missing
2168        for idx, row in manifest.iterrows():
2169            if not row["Id"]:
2170                gen_uuid = str(uuid.uuid4())
2172                manifest.loc[idx, ID_COLUMN] = gen_uuid
2173
2174        # Add entityId as a column if not already there
2175        if ENTITY_ID_COLUMN not in manifest:
2176            manifest[ENTITY_ID_COLUMN] = ""
2177        else:
2178            manifest[ENTITY_ID_COLUMN] = manifest[ENTITY_ID_COLUMN].fillna("")
2179
2180        return manifest
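
    # Illustration: a manifest with columns ["ID", "EntityID", "UUID"] is first
    # normalized to ["Id", "entityId", "uuid"]; because "Id" already exists the
    # "uuid" column is dropped, and blank "Id" cells are then filled with
    # freshly generated UUID strings.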
2181
2182    def _generate_table_name(self, manifest):
2183        """Helper function to generate a table name for upload to synapse.
2184
2185        Args:
2186            manifest (pd.DataFrame): Manifest loaded as a dataframe
2187
2188        Returns:
2189            table_name (str): Name of the table to load
2190            component_name (str): Name of the manifest component (if applicable)
2191        """
2192        # Create table name here.
2193        if "Component" in manifest.columns:
2194            component_name = manifest["Component"][0].lower()
2195            table_name = component_name + "_synapse_storage_manifest_table"
2196        else:
2197            component_name = ""
2198            table_name = "synapse_storage_manifest_table"
2199        return table_name, component_name
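
    # Illustration: a manifest whose first "Component" value is "Patient"
    # yields ("patient_synapse_storage_manifest_table", "patient"); without a
    # "Component" column it falls back to
    # ("synapse_storage_manifest_table", "").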
2200
2201    def _create_entity_id(self, idx, row, manifest, datasetId):
2202        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
2203        Args:
                idx: index of the manifest row being processed
2204            row: current row of manifest being processed
2205            manifest (pd.DataFrame): loaded df containing user supplied data.
2206            datasetId (str): synapse ID of folder containing the dataset
2207
2208        Returns:
2209            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
2210            entityId (str): Generated Entity Id.
2211
2212        """
2213        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
2214        rowEntity = self.syn.store(rowEntity)
2215        entityId = rowEntity["id"]
2216        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
2217        row["entityId"] = entityId
2218        manifest.loc[idx, "entityId"] = entityId
2219        return manifest, entityId
2220
2221    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
2222        """Process annotations and store them on synapse asynchronously
2223
2224        Args:
2225            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step
2226
2227        Raises:
2228            RuntimeError: raise a run time error if a task failed to complete
2229        """
2230        while requests:
2231            done_tasks, pending_tasks = await asyncio.wait(
2232                requests, return_when=asyncio.FIRST_COMPLETED
2233            )
2234            requests = pending_tasks
2235
2236            for completed_task in done_tasks:
2237                try:
2238                    annos = completed_task.result()
2239
2240                    if isinstance(annos, Annotations):
2241                        logger.info(f"Successfully stored annotations for {annos.id}")
2242                    else:
2243                        # store annotations if they are not None
2244                        if annos:
2245                            entity_id = annos["annotations"]["id"]
2246                            logger.info(
2247                                f"Obtained and processed annotations for {entity_id} entity"
2248                            )
2249                            requests.add(
2250                                asyncio.create_task(
2251                                    self.store_async_annotation(annotation_dict=annos)
2252                                )
2253                            )
2254                except Exception as e:
2255                    raise RuntimeError(f"failed with {repr(e)}.") from e
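
    # Example (a sketch): callers build one formatting task per manifest row
    # and then drain the set, as add_annotations_to_entities_files does below:
    #
    #     requests = {
    #         asyncio.create_task(
    #             self.format_row_annotations(
    #                 dmge, row, entityId, hideBlanks, annotation_keys
    #             )
    #         )
    #     }
    #     await self._process_store_annos(requests)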
2256
2257    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2258    async def add_annotations_to_entities_files(
2259        self,
2260        dmge,
2261        manifest,
2262        manifest_record_type: str,
2263        datasetId: str,
2264        hideBlanks: bool,
2265        manifest_synapse_table_id="",
2266        annotation_keys: str = "class_label",
2267    ):
2268        """
2269        Depending on upload type add Ids to entityId row. Add annotations to connected
2270        files and folders. Despite the name of this function, it also applies to folders.
2271
2272        Args:
2273            dmge: DataModelGraphExplorer Object
2274            manifest (pd.DataFrame): loaded df containing user supplied data.
2275            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities', matching the checks below. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2276            datasetId (str): synapse ID of folder containing the dataset
2277            hideBlanks (bool): Default is False. Boolean flag that skips uploading annotation keys with blank values when True, and uploads annotation keys with empty string values when False.
2278            manifest_synapse_table_id (str): Default is an empty string ''.
2279            annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
2280                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2281                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2282        Returns:
2283            manifest (pd.DataFrame): modified to add entityId as appropriate
2284
2285        """
2286
2287        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2288        if "filename" in [col.lower() for col in manifest.columns]:
2289            # get current list of files and store as dataframe
2290            dataset_files = self.getFilesInStorageDataset(datasetId)
2291            files_and_entityIds = self._get_file_entityIds(
2292                dataset_files=dataset_files, only_new_files=False
2293            )
2294            file_df = pd.DataFrame(files_and_entityIds)
2295
2296            # Merge dataframes to add entityIds
2297            manifest = manifest.merge(
2298                file_df, how="left", on="Filename", suffixes=["_x", None]
2299            ).drop("entityId_x", axis=1)
2300
2301        # Fill `entityId` for each row if missing and annotate entity as appropriate
2302        requests = set()
2303        for idx, row in manifest.iterrows():
2304            if not row["entityId"] and (
2305                manifest_record_type == "file_and_entities"
2306                or manifest_record_type == "table_file_and_entities"
2307            ):
2308                manifest, entityId = self._create_entity_id(
2309                    idx, row, manifest, datasetId
2310                )
2311            elif not row["entityId"] and manifest_record_type == "table_and_file":
2312                # If not using entityIds, fill with manifest_table_id so
2313                row["entityId"] = manifest_synapse_table_id
2314                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2315                entityId = ""
2316                # If the row is the manifest table, do not add annotations
2317            elif row["entityId"] == manifest_synapse_table_id:
2318                entityId = ""
2319            else:
2320                # get the file id of the file to annotate, collected in above step.
2321                entityId = row["entityId"]
2322
2323            # Adding annotations to connected files.
2324            if entityId:
2325                # Format annotations for Synapse
2326                annos_task = asyncio.create_task(
2327                    self.format_row_annotations(
2328                        dmge, row, entityId, hideBlanks, annotation_keys
2329                    )
2330                )
2331                requests.add(annos_task)
2332        await self._process_store_annos(requests)
2333        return manifest
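
    # Example (a sketch; the dataset ID is hypothetical) of driving this
    # coroutine from synchronous code, mirroring upload_manifest_as_table below:
    #
    #     manifest = asyncio.run(
    #         store.add_annotations_to_entities_files(
    #             dmge, manifest, "file_and_entities", "syn12345678", hideBlanks=True
    #         )
    #     )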
2334
2335    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2336    def upload_manifest_as_table(
2337        self,
2338        dmge: DataModelGraphExplorer,
2339        manifest: pd.DataFrame,
2340        metadataManifestPath: str,
2341        datasetId: str,
2342        table_name: str,
2343        component_name: str,
2344        restrict: bool,
2345        manifest_record_type: str,
2346        hideBlanks: bool,
2347        table_manipulation: str,
2348        table_column_names: str,
2349        annotation_keys: str,
2350        file_annotations_upload: bool = True,
2351    ):
2352        """Upload manifest to Synapse as a table and csv.
2353        Args:
2354            dmge: DataModelGraphExplorer object
2355            manifest (pd.DataFrame): loaded df containing user supplied data.
2356            metadataManifestPath: path to csv containing a validated metadata manifest.
2357            datasetId (str): synapse ID of folder containing the dataset
2358            table_name (str): Generated to name the table being uploaded.
2359            component_name (str): Name of the component manifest that is currently being uploaded.
2360            restrict (bool): Flag for censored data.
2361            manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2362            hideBlanks (bool): Default is False. Boolean flag that skips uploading annotation keys with blank values when True, and uploads annotation keys with empty string values when False.
2363            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2364            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2365                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2366                display label formatting.
2367            annotation_keys: (str) display_label/class_label (default), Sets labeling style for annotation keys. class_label will format the display
2368                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2369                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2370            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2371        Return:
2372            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2373        """
2374        # Upload manifest as a table, get the ID and updated manifest.
2375        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2376            dmge=dmge,
2377            manifest=manifest,
2378            datasetId=datasetId,
2379            table_name=table_name,
2380            restrict=restrict,
2381            table_manipulation=table_manipulation,
2382            table_column_names=table_column_names,
2383        )
2384
2385        if file_annotations_upload:
2386            manifest = asyncio.run(
2387                self.add_annotations_to_entities_files(
2388                    dmge,
2389                    manifest,
2390                    manifest_record_type,
2391                    datasetId,
2392                    hideBlanks,
2393                    manifest_synapse_table_id,
2394                    annotation_keys,
2395                )
2396            )
2397        # Load manifest to synapse as a CSV File
2398        manifest_synapse_file_id = self.upload_manifest_file(
2399            manifest=manifest,
2400            metadataManifestPath=metadataManifestPath,
2401            datasetId=datasetId,
2402            restrict_manifest=restrict,
2403            component_name=component_name,
2404        )
2405
2406        # Set annotations for the file manifest.
2407        manifest_annotations = self.format_manifest_annotations(
2408            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2409        )
2410        annos = self.syn.set_annotations(annotations=manifest_annotations)
2411        manifest_entity = self.synapse_entity_tracker.get(
2412            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2413        )
2414        manifest_entity.annotations = annos
2415        manifest_entity.etag = annos.etag
2416
2417        logger.info("Associated manifest file with dataset on Synapse.")
2418
2419        # Update manifest Synapse table with new entity id column.
2420        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2421            dmge=dmge,
2422            manifest=manifest,
2423            datasetId=datasetId,
2424            table_name=table_name,
2425            restrict=restrict,
2426            table_manipulation="update",
2427            table_column_names=table_column_names,
2428        )
2429
2430        # Set annotations for the table manifest
2431        manifest_annotations = self.format_manifest_annotations(
2432            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2433        )
2434        annotations_manifest_table = self.syn.set_annotations(
2435            annotations=manifest_annotations
2436        )
2437        manifest_table_entity = self.synapse_entity_tracker.get(
2438            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2439        )
2440        manifest_table_entity.annotations = annotations_manifest_table
2441        manifest_table_entity.etag = annotations_manifest_table.etag
2442
2443        return manifest_synapse_file_id
2444
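    # --- Illustrative usage sketch (not part of the original module). ---
    # Assuming `store` is an authenticated SynapseStorage instance and `dmge`
    # a DataModelGraphExplorer built from a data model; all IDs, names, and
    # paths below are hypothetical placeholders:
    #
    #   manifest_df = pd.read_csv("ExampleComponent_manifest.csv")
    #   file_id = store.upload_manifest_as_table(
    #       dmge=dmge,
    #       manifest=manifest_df,
    #       metadataManifestPath="ExampleComponent_manifest.csv",
    #       datasetId="syn12345678",
    #       table_name="examplecomponent_table",
    #       component_name="ExampleComponent",
    #       restrict=False,
    #       manifest_record_type="table_and_file",
    #       hideBlanks=False,
    #       table_manipulation="replace",
    #       table_column_names="class_label",
    #       annotation_keys="class_label",
    #   )
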
2445    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2446    def upload_manifest_as_csv(
2447        self,
2448        dmge,
2449        manifest,
2450        metadataManifestPath,
2451        datasetId,
2452        restrict,
2453        manifest_record_type,
2454        hideBlanks,
2455        component_name,
2456        annotation_keys: str,
2457        file_annotations_upload: bool = True,
2458    ):
2459        """Upload manifest to Synapse as a csv only.
2460        Args:
2461            dmge: DataModelGraphExplorer object
2462            manifest (pd.DataFrame): loaded df containing user supplied data.
2463            metadataManifestPath: path to csv containing a validated metadata manifest.
2464            datasetId (str): synapse ID of folder containing the dataset
2465            restrict (bool): Flag for censored data.
2466            manifest_record_type: one of 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create a csv only, entity ids and folders for each row in the manifest, a Synapse table to house the entire manifest, or a combination of these.
2467            hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
2468            annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2469                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2470                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2471            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2472        Return:
2473            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2474        """
2475        if file_annotations_upload:
2476            manifest = asyncio.run(
2477                self.add_annotations_to_entities_files(
2478                    dmge,
2479                    manifest,
2480                    manifest_record_type,
2481                    datasetId,
2482                    hideBlanks,
2483                    annotation_keys=annotation_keys,
2484                )
2485            )
2486
2487        # Load manifest to synapse as a CSV File
2488        manifest_synapse_file_id = self.upload_manifest_file(
2489            manifest,
2490            metadataManifestPath,
2491            datasetId,
2492            restrict,
2493            component_name=component_name,
2494        )
2495
2496        # Set annotations for the file manifest.
2497        manifest_annotations = self.format_manifest_annotations(
2498            manifest, manifest_synapse_file_id
2499        )
2500        annos = self.syn.set_annotations(manifest_annotations)
2501        manifest_entity = self.synapse_entity_tracker.get(
2502            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2503        )
2504        manifest_entity.annotations = annos
2505        manifest_entity.etag = annos.etag
2506
2507        logger.info("Associated manifest file with dataset on Synapse.")
2508
2509        return manifest_synapse_file_id
2510
2511    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2512    def upload_manifest_combo(
2513        self,
2514        dmge,
2515        manifest,
2516        metadataManifestPath,
2517        datasetId,
2518        table_name,
2519        component_name,
2520        restrict,
2521        manifest_record_type,
2522        hideBlanks,
2523        table_manipulation,
2524        table_column_names: str,
2525        annotation_keys: str,
2526        file_annotations_upload: bool = True,
2527    ):
2528        """Upload manifest to Synapse as a table and CSV with entities.
2529        Args:
2530            dmge: DataModelGraphExplorer object
2531            manifest (pd.DataFrame): loaded df containing user supplied data.
2532            metadataManifestPath: path to csv containing a validated metadata manifest.
2533            datasetId (str): synapse ID of folder containing the dataset
2534            table_name (str): Generated name for the table being uploaded.
2535            component_name (str): Name of the component manifest that is currently being uploaded.
2536            restrict (bool): Flag for censored data.
2537            manifest_record_type: one of 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create a csv only, entity ids and folders for each row in the manifest, a Synapse table to house the entire manifest, or a combination of these.
2538            hideBlanks (bool): Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
2539            table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2540            table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2541                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2542                display label formatting.
2543            annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2544                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2545                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2546            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2547        Return:
2548            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2549        """
2550        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2551            dmge=dmge,
2552            manifest=manifest,
2553            datasetId=datasetId,
2554            table_name=table_name,
2555            restrict=restrict,
2556            table_manipulation=table_manipulation,
2557            table_column_names=table_column_names,
2558        )
2559
2560        if file_annotations_upload:
2561            manifest = asyncio.run(
2562                self.add_annotations_to_entities_files(
2563                    dmge,
2564                    manifest,
2565                    manifest_record_type,
2566                    datasetId,
2567                    hideBlanks,
2568                    manifest_synapse_table_id,
2569                    annotation_keys=annotation_keys,
2570                )
2571            )
2572
2573        # Load manifest to synapse as a CSV File
2574        manifest_synapse_file_id = self.upload_manifest_file(
2575            manifest, metadataManifestPath, datasetId, restrict, component_name
2576        )
2577
2578        # Set annotations for the file manifest.
2579        manifest_annotations = self.format_manifest_annotations(
2580            manifest, manifest_synapse_file_id
2581        )
2582        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2583        manifest_entity = self.synapse_entity_tracker.get(
2584            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2585        )
2586        manifest_entity.annotations = file_manifest_annotations
2587        manifest_entity.etag = file_manifest_annotations.etag
2588        logger.info("Associated manifest file with dataset on Synapse.")
2589
2590        # Update manifest Synapse table with new entity id column.
2591        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2592            dmge=dmge,
2593            manifest=manifest,
2594            datasetId=datasetId,
2595            table_name=table_name,
2596            restrict=restrict,
2597            table_manipulation="update",
2598            table_column_names=table_column_names,
2599        )
2600
2601        # Set annotations for the table manifest
2602        manifest_annotations = self.format_manifest_annotations(
2603            manifest, manifest_synapse_table_id
2604        )
2605        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2606        manifest_entity = self.synapse_entity_tracker.get(
2607            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2608        )
2609        manifest_entity.annotations = table_manifest_annotations
2610        manifest_entity.etag = table_manifest_annotations.etag
2611        return manifest_synapse_file_id
2612
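    # --- Illustrative note (not part of the original module). ---
    # upload_manifest_as_table and upload_manifest_combo both call uploadDB
    # twice: first with the caller's table_manipulation to create, replace,
    # or upsert the table, then again with table_manipulation="update" so the
    # entityId column filled in by add_annotations_to_entities_files is
    # written back to the Synapse table.
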
2613    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2614    def associateMetadataWithFiles(
2615        self,
2616        dmge: DataModelGraphExplorer,
2617        metadataManifestPath: str,
2618        datasetId: str,
2619        manifest_record_type: str = "table_file_and_entities",
2620        hideBlanks: bool = False,
2621        restrict_manifest=False,
2622        table_manipulation: str = "replace",
2623        table_column_names: str = "class_label",
2624        annotation_keys: str = "class_label",
2625        file_annotations_upload: bool = True,
2626    ) -> str:
2627        """Associate metadata with files in a storage dataset already on Synapse.
2628        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2629
2630        If this is a new manifest, there may be no Synapse entities associated with the rows of this manifest;
2631        this may be due to the data type (e.g. clinical data) being tabular
2632        and not requiring files. To utilize uniform interfaces downstream
2633        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2634        and an entity column is added to the manifest containing the resulting
2635        entity IDs; a table is also created at present as an additional interface
2636        for downstream query and interaction with the data.
2637
2638        Args:
2639            dmge: DataModelGraphExplorer Object
2640            metadataManifestPath: path to csv containing a validated metadata manifest.
2641            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2642            Some datasets, e.g. clinical data, do not contain file IDs, but data is stored in a table: one row per item.
2643            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
2644            datasetId: synapse ID of folder containing the dataset
2645            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the 'file_and_entities' and 'table_and_file' options in combination.
2646            hideBlanks: Default is False. Boolean flag; if True, annotation keys with blank values are not uploaded. If False, annotation keys with empty string values are uploaded.
2647            restrict_manifest (bool): Default is False. Flag for censored data.
2648            table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2649            table_column_names (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2650                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2651                display label formatting.
2652            annotation_keys (str): display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2653                name as upper camelcase and strip blacklisted characters; display_label will strip blacklisted characters, including spaces, to retain
2654                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2655        Returns:
2656            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2657        """
2658        # Read new manifest CSV:
2659        manifest = self._read_manifest(metadataManifestPath)
2660        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2661
2662        table_name, component_name = self._generate_table_name(manifest)
2663
2664        # Upload manifest to synapse based on user input (manifest_record_type)
2665        if manifest_record_type == "file_only":
2666            manifest_synapse_file_id = self.upload_manifest_as_csv(
2667                dmge=dmge,
2668                manifest=manifest,
2669                metadataManifestPath=metadataManifestPath,
2670                datasetId=datasetId,
2671                restrict=restrict_manifest,
2672                hideBlanks=hideBlanks,
2673                manifest_record_type=manifest_record_type,
2674                component_name=component_name,
2675                annotation_keys=annotation_keys,
2676                file_annotations_upload=file_annotations_upload,
2677            )
2678        elif manifest_record_type == "table_and_file":
2679            manifest_synapse_file_id = self.upload_manifest_as_table(
2680                dmge=dmge,
2681                manifest=manifest,
2682                metadataManifestPath=metadataManifestPath,
2683                datasetId=datasetId,
2684                table_name=table_name,
2685                component_name=component_name,
2686                restrict=restrict_manifest,
2687                hideBlanks=hideBlanks,
2688                manifest_record_type=manifest_record_type,
2689                table_manipulation=table_manipulation,
2690                table_column_names=table_column_names,
2691                annotation_keys=annotation_keys,
2692                file_annotations_upload=file_annotations_upload,
2693            )
2694        elif manifest_record_type == "file_and_entities":
2695            manifest_synapse_file_id = self.upload_manifest_as_csv(
2696                dmge=dmge,
2697                manifest=manifest,
2698                metadataManifestPath=metadataManifestPath,
2699                datasetId=datasetId,
2700                restrict=restrict_manifest,
2701                hideBlanks=hideBlanks,
2702                manifest_record_type=manifest_record_type,
2703                component_name=component_name,
2704                annotation_keys=annotation_keys,
2705                file_annotations_upload=file_annotations_upload,
2706            )
2707        elif manifest_record_type == "table_file_and_entities":
2708            manifest_synapse_file_id = self.upload_manifest_combo(
2709                dmge=dmge,
2710                manifest=manifest,
2711                metadataManifestPath=metadataManifestPath,
2712                datasetId=datasetId,
2713                table_name=table_name,
2714                component_name=component_name,
2715                restrict=restrict_manifest,
2716                hideBlanks=hideBlanks,
2717                manifest_record_type=manifest_record_type,
2718                table_manipulation=table_manipulation,
2719                table_column_names=table_column_names,
2720                annotation_keys=annotation_keys,
2721                file_annotations_upload=file_annotations_upload,
2722            )
2723        else:
2724            raise ValueError("Please enter a valid manifest_record_type.")
2725        return manifest_synapse_file_id
2726
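    # --- Illustrative dispatch sketch (not part of the original module). ---
    # associateMetadataWithFiles routes to one of the upload_* helpers above
    # based on manifest_record_type; IDs and paths are hypothetical:
    #
    #   manifest_id = store.associateMetadataWithFiles(
    #       dmge=dmge,
    #       metadataManifestPath="ExampleComponent_manifest.csv",
    #       datasetId="syn12345678",
    #       manifest_record_type="table_file_and_entities",  # csv + table + entities
    #   )
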
2727    def getTableAnnotations(self, table_id: str):
2728        """Generate dictionary of annotations for the given Synapse table.
2729        Synapse returns all custom annotations as lists since they
2730        can contain multiple values. In all cases, the values will
2731        be converted into strings and concatenated with ", ".
2732
2733        Args:
2734            table_id (str): Synapse ID for the table.
2735
2736        Returns:
2737            dict: Annotations as comma-separated strings.
2738        """
2739        try:
2740            entity = self.synapse_entity_tracker.get(
2741                synapse_id=table_id, syn=self.syn, download_file=False
2742            )
2743            is_table = entity.concreteType.endswith(".TableEntity")
2744            annotations_raw = entity.annotations
2745        except SynapseHTTPError:
2746            # If an error occurs with retrieving entity, skip it
2747            # This could be caused by a temporary file view that
2748            # was deleted since its ID was retrieved
2749            is_table = False
2750
2751        # Skip anything that isn't a table
2752        if not is_table:
2753            return None
2754
2755        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2756
2757        return annotations
2758
2759    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2760        """Generate dictionary of annotations for the given Synapse file.
2761        Synapse returns all custom annotations as lists since they
2762        can contain multiple values. In all cases, the values will
2763        be converted into strings and concatenated with ", ".
2764
2765        Args:
2766            fileId (str): Synapse ID for dataset file.
2767
2768        Returns:
2769            dict: Annotations as comma-separated strings.
2770        """
2771
2772        # Get entity metadata, including annotations
2773        try:
2774            entity = self.synapse_entity_tracker.get(
2775                synapse_id=fileId, syn=self.syn, download_file=False
2776            )
2777            is_file = entity.concreteType.endswith(".FileEntity")
2778            is_folder = entity.concreteType.endswith(".Folder")
2779            annotations_raw = entity.annotations
2780        except SynapseHTTPError:
2781            # If an error occurs with retrieving entity, skip it
2782            # This could be caused by a temporary file view that
2783            # was deleted since its ID was retrieved
2784            is_file, is_folder = False, False
2785
2786        # Skip anything that isn't a file or folder
2787        if not (is_file or is_folder):
2788            return None
2789
2790        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2791
2792        return annotations
2793
2794    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2795        # Extract annotations from their lists and stringify. For example:
2796        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2797        annotations = dict()
2798        for key, vals in annotations_raw.items():
2799            if isinstance(vals, list) and len(vals) == 1:
2800                annotations[key] = str(vals[0])
2801            else:
2802                annotations[key] = ", ".join(str(v) for v in vals)
2803
2804        # Add the file entity ID and eTag, which weren't lists
2805        assert fileId == entity.id, (
2806            "For some reason, the Synapse ID in the response doesn't match "
2807            "the Synapse ID sent in the request (via synapseclient)."
2808        )
2809        annotations["entityId"] = fileId
2810        annotations["eTag"] = entity.etag
2811
2812        return annotations
2813
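    # For example (hypothetical values), annotations_raw of
    #   {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}
    # is flattened by getEntityAnnotations into
    #   {"YearofBirth": "1980", "author": "bruno, milen, sujay",
    #    "entityId": "syn...", "eTag": "..."}
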
2814    def getDatasetAnnotations(
2815        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2816    ) -> pd.DataFrame:
2817        """Generate table for annotations across all files in given dataset.
2818
2819        Args:
2820            datasetId (str): Synapse ID for dataset folder.
2821            fill_na (bool): Whether to replace missing values with
2822                blank strings.
2823            force_batch (bool): Whether to force the function to use
2824                the batch mode, which uses a file view to retrieve
2825                annotations for a given dataset. Default to False
2826                unless there are more than 50 files in the dataset.
2827
2828        Returns:
2829            pd.DataFrame: Table of annotations.
2830        """
2831        # Get all files in given dataset
2832        dataset_files = self.getFilesInStorageDataset(datasetId)
2833
2834        # if there are no dataset files, there are no annotations
2835        # return an empty data frame
2836        if not dataset_files:
2837            return pd.DataFrame()
2838
2839        dataset_files_map = dict(dataset_files)
2840        dataset_file_ids, _ = list(zip(*dataset_files))
2841
2842        # Get annotations for each file from Step 1
2843        # Batch mode
2844        try_batch = len(dataset_files) >= 50 or force_batch
2845        if try_batch:
2846            try:
2847                logger.info("Trying batch mode for retrieving Synapse annotations")
2848                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2849            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2850                logger.info(
2851                    f"Unable to create a temporary file view bound to {datasetId}. "
2852                    "Defaulting to slower iterative retrieval of annotations."
2853                )
2854                # Default to the slower non-batch method
2855                logger.info("Batch mode failed (probably due to permission error)")
2856                try_batch = False
2857
2858        # Non-batch mode
2859        if not try_batch:
2860            logger.info("Using slower (non-batch) sequential mode")
2861            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2862            # Remove any annotations for non-file/folders (stored as None)
2863            records = filter(None, records)
2864            table = pd.DataFrame.from_records(records)
2865
2866        # Add filenames for the files that "survived" annotation retrieval
2867        filenames = [dataset_files_map[i] for i in table["entityId"]]
2868
2869        if "Filename" not in table.columns:
2870            table.insert(0, "Filename", filenames)
2871
2872        # Ensure that entityId and eTag are at the end
2873        entity_ids = table.pop("entityId")
2874        etags = table.pop("eTag")
2875        table.insert(len(table.columns), "entityId", entity_ids)
2876        table.insert(len(table.columns), "eTag", etags)
2877
2878        # Missing values are filled in with empty strings for Google Sheets
2879        if fill_na:
2880            table.fillna("", inplace=True)
2881
2882        # Force all values as strings
2883        return table.astype(str)
2884
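    # --- Illustrative usage sketch (not part of the original module). ---
    # getDatasetAnnotations tries the batch (file view) path when a dataset
    # holds 50 or more files, or when forced; the dataset ID below is
    # hypothetical:
    #
    #   annotations = store.getDatasetAnnotations("syn12345678", force_batch=True)
    #   annotations.head()  # Filename, <annotation columns>, entityId, eTag
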
2885    def raise_final_error(retry_state):
2886        return retry_state.outcome.result()
2887
2888    def checkIfinAssetView(self, syn_id) -> bool:
2889        # get data in administrative fileview for this pipeline
2890        assetViewTable = self.getStorageFileviewTable()
2891        all_files = list(assetViewTable["id"])
2892        if syn_id in all_files:
2893            return True
2894        else:
2895            return False
2896
2897    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2898    @retry(
2899        stop=stop_after_attempt(5),
2900        wait=wait_chain(
2901            *[wait_fixed(10) for i in range(2)]
2902            + [wait_fixed(15) for i in range(2)]
2903            + [wait_fixed(20)]
2904        ),
2905        retry=retry_if_exception_type(LookupError),
2906        retry_error_callback=raise_final_error,
2907    )
2908    def getDatasetProject(self, datasetId: str) -> str:
2909        """Get parent project for a given dataset ID.
2910
2911        Args:
2912            datasetId (str): Synapse entity ID (folder or project).
2913
2914        Raises:
2915            ValueError: Raised if Synapse ID cannot be retrieved
2916            by the user or if it doesn't appear in the file view.
2917
2918        Returns:
2919            str: The Synapse ID for the parent project.
2920        """
2921
2922        # Subset main file view
2923        dataset_index = self.storageFileviewTable["id"] == datasetId
2924        dataset_row = self.storageFileviewTable[dataset_index]
2925
2926        # re-query if no datasets found
2927        if dataset_row.empty:
2928            sleep(5)
2929            self.query_fileview(force_requery=True)
2930            # Subset main file view
2931            dataset_index = self.storageFileviewTable["id"] == datasetId
2932            dataset_row = self.storageFileviewTable[dataset_index]
2933
2934        # Return `projectId` for given row if only one found
2935        if len(dataset_row) == 1:
2936            dataset_project = dataset_row["projectId"].values[0]
2937            return dataset_project
2938
2939        # Otherwise, check if already project itself
2940        try:
2941            syn_object = self.synapse_entity_tracker.get(
2942                synapse_id=datasetId, syn=self.syn, download_file=False
2943            )
2944            if syn_object.properties["concreteType"].endswith("Project"):
2945                return datasetId
2946        except SynapseHTTPError:
2947            raise PermissionError(
2948                f"The given dataset ({datasetId}) isn't accessible with this "
2949                "user. This might be caused by a typo in the dataset Synapse ID."
2950            )
2951
2952        # If not, then assume dataset not in file view
2953        raise LookupError(
2954            f"The given dataset ({datasetId}) doesn't appear in the "
2955            f"configured file view ({self.storageFileview}). This might "
2956            "mean that the file view's scope needs to be updated."
2957        )
2958
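    # --- Illustrative note (not part of the original module). ---
    # The tenacity decorator above retries getDatasetProject on LookupError
    # for up to 5 attempts; wait_chain draws the pauses between attempts in
    # order from 10s, 10s, 15s, 15s (the 20s step would only apply to a later
    # retry, but stop_after_attempt(5) ends first), and raise_final_error
    # re-raises the last LookupError once the retries are exhausted.
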
2959    def getDatasetAnnotationsBatch(
2960        self, datasetId: str, dataset_file_ids: Sequence[str] = None
2961    ) -> pd.DataFrame:
2962        """Generate table for annotations across all files in given dataset.
2963        This function uses a temporary file view to generate a table
2964        instead of iteratively querying for individual entity annotations.
2965        This function is expected to run much faster than
2966        iterating `self.getFileAnnotations` over large datasets.
2967
2968        Args:
2969            datasetId (str): Synapse ID for dataset folder.
2970            dataset_file_ids (Sequence[str]): List of Synapse IDs
2971                for dataset files/folders used to subset the table.
2972
2973        Returns:
2974            pd.DataFrame: Table of annotations.
2975        """
2976        # Create data frame from annotations file view
2977        with DatasetFileView(datasetId, self.syn) as fileview:
2978            table = fileview.query()
2979
2980        if dataset_file_ids:
2981            table = table.loc[table.index.intersection(dataset_file_ids)]
2982
2983        table = table.reset_index(drop=True)
2984
2985        return table
2986
2987    def _get_table_schema_by_cname(self, table_schema):
2988        # assume no duplicate column names in the table
2989        table_schema_by_cname = {}
2990
2991        for col_record in table_schema:
2992            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
2993            table_schema_by_cname[col_record["name"]] = col_record
2994
2995        return table_schema_by_cname
2996
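    # For example (values hypothetical), given
    #   table_schema = [
    #       {"name": "Id", "columnType": "STRING", "maximumSize": 64},
    #       {"name": "YearofBirth", "columnType": "INTEGER"},
    #   ]
    # _get_table_schema_by_cname(table_schema) returns
    #   {
    #       "Id": {"name": "Id", "columnType": "STRING", "maximumSize": 64},
    #       "YearofBirth": {"name": "YearofBirth", "columnType": "INTEGER"},
    #   }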
2997
2998class TableOperations:
2999    """
3000    Object to hold functions for various table operations specific to the Synapse Asset Store.
3001
3002    Currently implemented operations are:
3003    createTable: upload a manifest as a new table when none exists
3004    replaceTable: replace the metadata in a table from one manifest with metadata from another manifest
3005    updateTable: add a column to a table that already exists on synapse
3006
3007    Operations currently in development are:
3008    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
3009    """
3010
3011    def __init__(
3012        self,
3013        synStore: SynapseStorage,
3014        tableToLoad: pd.DataFrame = None,
3015        tableName: str = None,
3016        datasetId: str = None,
3017        existingTableId: str = None,
3018        restrict: bool = False,
3019        synapse_entity_tracker: SynapseEntityTracker = None,
3020    ):
3021        """
3022        Class governing table operations (creation, replacement, upserts, updates) in schematic
3023
3024        tableToLoad: manifest formatted appropriately for the table
3025        tableName: name of the table to be uploaded
3026        datasetId: synID of the dataset for the manifest
3027        existingTableId: synId of the table currently existing on synapse (if there is one)
3028        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3029        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3030
3031        """
3032        self.synStore = synStore
3033        self.tableToLoad = tableToLoad
3034        self.tableName = tableName
3035        self.datasetId = datasetId
3036        self.existingTableId = existingTableId
3037        self.restrict = restrict
3038        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3039
3040    @tracer.start_as_current_span("TableOperations::createTable")
3041    def createTable(
3042        self,
3043        columnTypeDict: dict = None,
3044        specifySchema: bool = True,
3045    ):
3046        """
3047        Method to create a table from a metadata manifest and upload it to synapse
3048
3049        Args:
3050            columnTypeDict: dictionary schema for table columns: type, size, etc
3051            specifySchema: to specify a specific schema for the table format
3052
3053        Returns:
3054            table.schema.id: synID of the newly created table
3055        """
3056        datasetEntity = self.synapse_entity_tracker.get(
3057            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3058        )
3059        datasetName = datasetEntity.name
3060        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3061
3062        if not self.tableName:
3063            self.tableName = datasetName + "table"
3064        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3065        if specifySchema:
3066            if columnTypeDict == {}:
3067                logger.error("Did not provide a columnTypeDict.")
3068            # create list of columns:
3069            cols = []
3070            for col in self.tableToLoad.columns:
3071                if col in table_schema_by_cname:
3072                    col_type = table_schema_by_cname[col]["columnType"]
3073                    max_size = (
3074                        table_schema_by_cname[col]["maximumSize"]
3075                        if "maximumSize" in table_schema_by_cname[col].keys()
3076                        else 100
3077                    )
3078                    max_list_len = 250
3079                    if max_size and max_list_len:
3080                        cols.append(
3081                            Column(
3082                                name=col,
3083                                columnType=col_type,
3084                                maximumSize=max_size,
3085                                maximumListLength=max_list_len,
3086                            )
3087                        )
3088                    elif max_size:
3089                        cols.append(
3090                            Column(name=col, columnType=col_type, maximumSize=max_size)
3091                        )
3092                    else:
3093                        cols.append(Column(name=col, columnType=col_type))
3094                else:
3095                    # TODO: add warning that the given col was not found and its max size is set to 100
3096                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3097            schema = Schema(
3098                name=self.tableName, columns=cols, parent=datasetParentProject
3099            )
3100            table = Table(schema, self.tableToLoad)
3101            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3102            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3103            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3104            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3105            return table.schema.id
3106        else:
3107            # For just uploading the tables to synapse using default
3108            # column types.
3109            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3110            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3111            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3112            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3113            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3114            return table.schema.id
3115
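    # --- Illustrative note (not part of the original module). ---
    # createTable builds one synapseclient Column per manifest column. A
    # column present in columnTypeDict keeps its recorded type and size, e.g.
    # (hypothetical values):
    #
    #   Column(name="Id", columnType="STRING", maximumSize=64,
    #          maximumListLength=250)
    #
    # while columns missing from the schema fall back to:
    #
    #   Column(name=col, columnType="STRING", maximumSize=100)
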
3116    @tracer.start_as_current_span("TableOperations::replaceTable")
3117    def replaceTable(
3118        self,
3119        specifySchema: bool = True,
3120        columnTypeDict: dict = None,
3121    ):
3122        """
3123        Method to replace an existing table on synapse with metadata from a new manifest
3124
3125        Args:
3126            specifySchema: to specify a specific schema for the table format
3127            columnTypeDict: dictionary schema for table columns: type, size, etc
3128
3129        Returns:
3130           existingTableId: synID of the already existing table that had its metadata replaced
3131        """
3132        datasetEntity = self.synapse_entity_tracker.get(
3133            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3134        )
3135
3136        datasetName = datasetEntity.name
3137        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3138        existing_table, existing_results = self.synStore.get_synapse_table(
3139            self.existingTableId
3140        )
3141        # remove rows
3142        self.synStore.syn.delete(existing_results)
3143        # Data changes such as removing all rows causes the eTag to change.
3144        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3145        # wait for row deletion to finish on synapse before getting empty table
3146        sleep(10)
3147
3148        # removes all current columns
3149        current_table = self.synapse_entity_tracker.get(
3150            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3151        )
3152
3153        current_columns = self.synStore.syn.getTableColumns(current_table)
3154
3155        for col in current_columns:
3156            current_table.removeColumn(col)
3157
3158        if not self.tableName:
3159            self.tableName = datasetName + "table"
3160
3161        # Process columns according to manifest entries
3162        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3163        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3164        if specifySchema:
3165            if columnTypeDict == {}:
3166                logger.error("Did not provide a columnTypeDict.")
3167            # create list of columns:
3168            cols = []
3169
3170            for col in self.tableToLoad.columns:
3171                if col in table_schema_by_cname:
3172                    col_type = table_schema_by_cname[col]["columnType"]
3173                    max_size = (
3174                        table_schema_by_cname[col]["maximumSize"]
3175                        if "maximumSize" in table_schema_by_cname[col].keys()
3176                        else 100
3177                    )
3178                    max_list_len = 250
3179                    if max_size and max_list_len:
3180                        cols.append(
3181                            Column(
3182                                name=col,
3183                                columnType=col_type,
3184                                maximumSize=max_size,
3185                                maximumListLength=max_list_len,
3186                            )
3187                        )
3188                    elif max_size:
3189                        cols.append(
3190                            Column(name=col, columnType=col_type, maximumSize=max_size)
3191                        )
3192                    else:
3193                        cols.append(Column(name=col, columnType=col_type))
3194                else:
3195                    # TODO: add warning that the given col was not found and its max size is set to 100
3196                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3197
3198            # adds new columns to schema
3199            for col in cols:
3200                current_table.addColumn(col)
3201
3202            table_result = self.synStore.syn.store(
3203                current_table, isRestricted=self.restrict
3204            )
3205            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3206            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3207            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3208
3209            # wait for synapse store to finish
3210            sleep(1)
3211
3212            # build schema and table from columns and store with necessary restrictions
3213            schema = Schema(
3214                name=self.tableName, columns=cols, parent=datasetParentProject
3215            )
3216            schema.id = self.existingTableId
3217            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3218            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3219            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3220            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3221            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3222        else:
3223            logger.error("Must specify a schema for table replacements")
3224
3225        # remove system metadata from manifest
3226        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3227        return self.existingTableId
3228
3229    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3230    def _get_auth_token(
3231        self,
3232    ):
3233        authtoken = None
3234
3235        # Get access token from environment variable if available
3236        # Primarily useful for testing environments, with other possible usefulness for containers
3237        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3238        if env_access_token:
3239            authtoken = env_access_token
3240            return authtoken
3241
3242        # Get token from authorization header
3243        # Primarily useful for API endpoint functionality
3244        if "Authorization" in self.synStore.syn.default_headers:
3245            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3246                "Bearer "
3247            )[-1]
3248            return authtoken
3249
3250        # retrieve credentials from synapse object
3251        # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe
3252        synapse_object_creds = self.synStore.syn.credentials
3253        if hasattr(synapse_object_creds, "_token"):
3254            authtoken = synapse_object_creds.secret
3255
3256        # Try getting creds from .synapseConfig file if it exists
3257        # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
3258        if os.path.exists(CONFIG.synapse_configuration_path):
3259            config = get_config_file(CONFIG.synapse_configuration_path)
3260
3261            # check which credentials are provided in file
3262            if config.has_option("authentication", "authtoken"):
3263                authtoken = config.get("authentication", "authtoken")
3264
3265        # raise error if required credentials are not found
3266        if not authtoken:
3267            raise NameError(
3268                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3269            )
3270
3271        return authtoken
3272
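    # --- Illustrative note (not part of the original module). ---
    # _get_auth_token checks credential sources in order: the
    # SYNAPSE_ACCESS_TOKEN environment variable, an "Authorization: Bearer"
    # header on the Synapse client, the client's stored credentials, and the
    # [authentication] authtoken entry in .synapseConfig. For example
    # (instance name hypothetical):
    #
    #   import os
    #   os.environ["SYNAPSE_ACCESS_TOKEN"] = "<personal access token>"
    #   token = table_ops._get_auth_token()  # the env token wins
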
3273    @tracer.start_as_current_span("TableOperations::upsertTable")
3274    def upsertTable(self, dmge: DataModelGraphExplorer):
3275        """
3276        Method to upsert rows from a new manifest into an existing table on synapse
3277        For upsert functionality to work, primary keys must follow the naming convention of <component>_id
3278        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3279        Currently it is required to use -tcn "display label" with table upserts.
3280
3281
3282        Args:
3283            dmge: DataModelGraphExplorer instance
3284
3285        Returns:
3286           existingTableId: synID of the already existing table that had rows upserted into it
3287        """
3288
3289        authtoken = self._get_auth_token()
3290
3291        synapseDB = SynapseDatabase(
3292            auth_token=authtoken,
3293            project_id=self.synStore.getDatasetProject(self.datasetId),
3294            syn=self.synStore.syn,
3295            synapse_entity_tracker=self.synapse_entity_tracker,
3296        )
3297
3298        try:
3299            # Try performing upsert
3300            synapseDB.upsert_table_rows(
3301                table_name=self.tableName, data=self.tableToLoad
3302            )
3303        except SynapseHTTPError as ex:
3304            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3305            if "Id is not a valid column name or id" in str(ex):
3306                self._update_table_uuid_column(dmge)
3307                synapseDB.upsert_table_rows(
3308                    table_name=self.tableName, data=self.tableToLoad
3309                )
3310            # Raise if other error
3311            else:
3312                raise ex
3313
3314        return self.existingTableId
3315
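    # --- Illustrative note (not part of the original module). ---
    # Upserts require a primary-key column named <component>_id; e.g. a
    # manifest for a hypothetical "Biospecimen" component must include a
    # "Biospecimen_id" column. Create such tables with `-tm upsert` and use
    # `-tcn "display label"` when upserting.
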
3316    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3317    def _update_table_uuid_column(
3318        self,
3319        dmge: DataModelGraphExplorer,
3320    ) -> None:
3321        """Removes the `Uuid` column when present and replaces it with an `Id` column.
3322        Used to enable backwards compatibility for manifests using the old `Uuid` convention.
3323
3324        Args:
3325            dmge: DataModelGraphExplorer instance
3326
3327        Returns:
3328            None
3329        """
3330
3331        # Get the columns of the schema
3332        schema = self.synapse_entity_tracker.get(
3333            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3334        )
3335
3336        cols = self.synStore.syn.getTableColumns(schema)
3337
3338        # Iterate through columns until `Uuid` column is found
3339        for col in cols:
3340            if col.name.lower() == "uuid":
3341                # See if schema has `Uuid` column specified
3342                try:
3343                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3344                except KeyError:
3345                    uuid_col_in_schema = False
3346
3347                # If there is, then create a new `Id` column from scratch
3348                if uuid_col_in_schema:
3349                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3350                    schema.addColumn(new_col)
3351                    schema = self.synStore.syn.store(schema)
3352                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3353                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3354                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3355                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3356                else:
3357                    # Build ColumnModel that will be used for new column
3358                    id_column = Column(
3359                        name="Id",
3360                        columnType="STRING",
3361                        maximumSize=64,
3362                        defaultValue=None,
3363                        maximumListLength=1,
3364                    )
3365                    new_col_response = self.synStore.syn.store(id_column)
3366
3367                    # Define columnChange body
3368                    columnChangeDict = {
3369                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3370                        "entityId": self.existingTableId,
3371                        "changes": [
3372                            {
3373                                "oldColumnId": col["id"],
3374                                "newColumnId": new_col_response["id"],
3375                            }
3376                        ],
3377                    }
3378
3379                    self.synStore.syn._async_table_update(
3380                        table=self.existingTableId,
3381                        changes=[columnChangeDict],
3382                        wait=False,
3383                    )
3384                break
3385
3386        return
3387
3388    @tracer.start_as_current_span("TableOperations::updateTable")
3389    def updateTable(
3390        self,
3391        update_col: str = "Id",
3392    ):
3393        """
3394        Method to update an existing table with a new column
3395
3396        Args:
3397            update_col: column to index the old and new tables on
3398
3399        Returns:
3400           existingTableId: synID of the already existing table that was updated
3401        """
3402        existing_table, existing_results = self.synStore.get_synapse_table(
3403            self.existingTableId
3404        )
3405
3406        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3407        # store table with existing etag data and impose restrictions as appropriate
3408        table_result = self.synStore.syn.store(
3409            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3410            isRestricted=self.restrict,
3411        )
3412        # We cannot store the Table to the `synapse_entity_tracker` because there is
3413        # not `Schema` on the table object. The above `.store()` function call would
3414        # also update the ETag of the entity within Synapse. Remove it from the tracker
3415        # and re-retrieve it later on if needed again.
3416        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3417
3418        return self.existingTableId
3419
3420
3421class DatasetFileView:
3422    """Helper class to create temporary dataset file views.
3423    This class can be used in conjunction with a 'with' statement.
3424    This will ensure that the file view is deleted automatically.
3425    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3426    """
3427
3428    def __init__(
3429        self,
3430        datasetId: str,
3431        synapse: Synapse,
3432        name: str = None,
3433        temporary: bool = True,
3434        parentId: str = None,
3435    ) -> None:
3436        """Create a file view scoped to a dataset folder.
3437
3438        Args:
3439            datasetId (str): Synapse ID for a dataset folder/project.
3440            synapse (Synapse): Used for Synapse requests.
3441            name (str): Name of the file view (temporary or not).
3442            temporary (bool): Whether to delete the file view on exit
3443                of either a 'with' statement or Python entirely.
3444            parentId (str, optional): Synapse ID specifying where to
3445                store the file view. Defaults to datasetId.
3446        """
3447
3448        self.datasetId = datasetId
3449        self.synapse = synapse
3450        self.is_temporary = temporary
3451
3452        self.name = name or f"schematic annotation file view for {self.datasetId}"
3454
3455        if self.is_temporary:
3456            uid = secrets.token_urlsafe(5)
3457            self.name = f"{self.name} - UID {uid}"
3458
3459        # TODO: Allow a DCC admin to configure a "universal parent"
3460        #       Such as a Synapse project writeable by everyone.
3461        self.parentId = datasetId if parentId is None else parentId
3462
3463        # TODO: Create local sharing setting to hide from everyone else
3464        view_schema = EntityViewSchema(
3465            name=self.name,
3466            parent=self.parentId,
3467            scopes=self.datasetId,
3468            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3469            addDefaultViewColumns=False,
3470            addAnnotationColumns=True,
3471        )
3472
3473        # TODO: Handle failure due to insufficient permissions by
3474        #       creating a temporary new project to store view
3475        self.view_schema = self.synapse.store(view_schema)
3476
3477        # These are filled in after calling `self.query()`
3478        self.results = None
3479        self.table = None
3480
3481        # Ensure deletion of the file view (last resort)
3482        if self.is_temporary:
3483            atexit.register(self.delete)
3484
3485    def __enter__(self):
3486        """Return file view when entering 'with' statement."""
3487        return self
3488
3489    def __exit__(self, exc_type, exc_value, traceback):
3490        """Delete file view when exiting 'with' statement."""
3491        if self.is_temporary:
3492            self.delete()
3493
3494    def delete(self):
3495        """Delete the file view on Synapse without deleting local table."""
3496        if self.view_schema is not None:
3497            self.synapse.delete(self.view_schema)
3498            self.view_schema = None
3499
3500    def query(self, tidy=True, force=False):
3501        """Retrieve file view as a data frame (raw format sans index)."""
3502        if self.table is None or force:
3503            fileview_id = self.view_schema["id"]
3504            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3505            self.table = self.results.asDataFrame(
3506                rowIdAndVersionInIndex=False,
3507                na_values=STR_NA_VALUES_FILTERED,
3508                keep_default_na=False,
3509            )
3510        if tidy:
3511            self.tidy_table()
3512        return self.table
3513
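    # --- Illustrative usage sketch (not part of the original module). ---
    # The 'with' form guarantees the temporary view is deleted even if the
    # query fails; the dataset ID is hypothetical:
    #
    #   with DatasetFileView("syn12345678", syn) as fileview:
    #       table = fileview.query()  # tidied pd.DataFrame of annotations
    #   # the temporary file view is removed on exit
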
3514    def tidy_table(self):
3515        """Convert raw file view data frame into more usable format."""
3516        assert self.table is not None, "Must call `self.query()` first."
3517        self._fix_default_columns()
3518        self._fix_list_columns()
3519        self._fix_int_columns()
3520        return self.table
3521
3522    def _fix_default_columns(self):
3523        """Rename default columns to match schematic expectations."""
3524
3525        # Drop ROW_VERSION column if present
3526        if "ROW_VERSION" in self.table:
3527            del self.table["ROW_VERSION"]
3528
3529        # Rename id column to entityId and set as data frame index
3530        if "ROW_ID" in self.table:
3531            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3532            self.table = self.table.set_index("entityId", drop=False)
3533            del self.table["ROW_ID"]
3534
3535        # Rename ROW_ETAG column to eTag and place at end of data frame
3536        if "ROW_ETAG" in self.table:
3537            row_etags = self.table.pop("ROW_ETAG")
3538
3539            # eTag column may already be present if users annotated data without submitting a manifest
3540            # we're only concerned with the new values and not the existing ones
3541            if "eTag" in self.table:
3542                del self.table["eTag"]
3543
3544            self.table.insert(len(self.table.columns), "eTag", row_etags)
3545
3546        return self.table
3547
3548    def _get_columns_of_type(self, types):
3549        """Helper function to get list of columns of a given type(s)."""
3550        matching_columns = []
3551        for header in self.results.headers:
3552            if header.columnType in types:
3553                matching_columns.append(header.name)
3554        return matching_columns
3555
3556    def _fix_list_columns(self):
3557        """Fix formatting of list-columns."""
3558        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3559        list_columns = self._get_columns_of_type(list_types)
3560        for col in list_columns:
3561            self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
3562        return self.table
3563
3564    def _fix_int_columns(self):
3565        """Ensure that integer-columns are actually integers."""
3566        int_columns = self._get_columns_of_type({"INTEGER"})
3567        for col in int_columns:
3568            # Coercing to string because NaN is a floating point value
3569            # and cannot exist alongside integers in a column
3570            def to_int_fn(x):
3571                return "" if np.isnan(x) else str(int(x))
3572
3573            self.table[col] = self.table[col].apply(to_int_fn)
3574        return self.table
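
A minimal usage sketch for the temporary file view class above (in schematic this is the DatasetFileView helper; the constructor signature and the dataset ID below are assumptions for illustration, not part of the module source). Used as a context manager, the temporary view is deleted on Synapse when the block exits, and query(tidy=True) returns the cleaned data frame:

    >>> import synapseclient
    >>> syn = synapseclient.Synapse()
    >>> syn.login(silent=True)
    >>> with DatasetFileView("syn12345678", syn) as view:  # hypothetical dataset ID
    ...     table = view.query(tidy=True)  # raw query, then tidy_table()
    ...     entity_ids = table["entityId"].tolist()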
logger = <Logger Synapse storage (WARNING)>
tracer = <opentelemetry.sdk.trace.Tracer object>
ID_COLUMN = 'Id'
ENTITY_ID_COLUMN = 'entityId'
UUID_COLUMN = 'uuid'
@dataclass
class ManifestDownload:
 91@dataclass
 92class ManifestDownload(object):
 93    """
 94    syn: an object of type synapseclient.Synapse
 95    manifest_id: id of a manifest
 96    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
 97    """
 98
 99    syn: synapseclient.Synapse
100    manifest_id: str
101    synapse_entity_tracker: SynapseEntityTracker = field(
102        default_factory=SynapseEntityTracker
103    )
104
105    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
106        """
107        Try downloading a manifest to a specific folder (temporary or not). When the
108        `use_temporary_folder` is set to True, the manifest will be downloaded to a
109        temporary folder. This is useful for when the code is running as an API server
110        where multiple requests are being made at the same time. This will prevent
111        multiple requests from overwriting the same manifest file. When the
112        `use_temporary_folder` is set to False, the manifest will be downloaded to the
113        default manifest folder.
114
115        Args:
116            use_temporary_folder: boolean argument indicating if a temporary folder
117                should be used to store the manifest file. This is useful when running
118                this code as an API server where multiple requests could be made at the
119                same time. This is set to False when the code is being used from the
120                CLI. Defaults to True.
121
122        Return:
123            manifest_data: A Synapse file entity of the downloaded manifest
124        """
125        manifest_data = self.synapse_entity_tracker.get(
126            synapse_id=self.manifest_id,
127            syn=self.syn,
128            download_file=False,
129            retrieve_if_not_present=False,
130        )
131        current_span = trace.get_current_span()
132        if (
133            manifest_data
134            and (file_handle := manifest_data.get("_file_handle", None))
135            and current_span.is_recording()
136        ):
137            current_span.set_attribute(
138                "schematic.manifest_size", file_handle.get("contentSize", 0)
139            )
140
141        if manifest_data and manifest_data.path:
142            return manifest_data
143
144        if "SECRETS_MANAGER_SECRETS" in os.environ:
145            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
146            cleanup_temporary_storage(
147                temporary_manifest_storage, time_delta_seconds=3600
148            )
149            # create a new directory to store manifest
150            if not os.path.exists(temporary_manifest_storage):
151                os.mkdir(temporary_manifest_storage)
152            # create temporary folders for storing manifests
153            download_location = create_temp_folder(
154                path=temporary_manifest_storage,
155                prefix=f"{self.manifest_id}-{time.time()}-",
156            )
157        else:
158            if use_temporary_folder:
159                download_location = create_temp_folder(
160                    path=CONFIG.manifest_folder,
161                    prefix=f"{self.manifest_id}-{time.time()}-",
162                )
163            else:
164                download_location = CONFIG.manifest_folder
165
166        manifest_data = self.synapse_entity_tracker.get(
167            synapse_id=self.manifest_id,
168            syn=self.syn,
169            download_file=True,
170            retrieve_if_not_present=True,
171            download_location=download_location,
172        )
173
174        # Rename the downloaded file back to its original name. This is important
175        # because we may be re-using a file that was previously downloaded and then
176        # renamed; the file returned by the Synapse client is a direct copy of that
177        # renamed file. This code resets the file name to the original name that
178        # was used to download the file. Note: an MD5 checksum of the file will
179        # still be performed, so if the file has changed, it will be
180        # downloaded again.
181        filename = manifest_data._file_handle.fileName
182        if filename != os.path.basename(manifest_data.path):
183            parent_folder = os.path.dirname(manifest_data.path)
184            manifest_original_name_and_path = os.path.join(parent_folder, filename)
185
186            self.syn.cache.remove(
187                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
188            )
189            os.rename(manifest_data.path, manifest_original_name_and_path)
190            manifest_data.path = manifest_original_name_and_path
191            self.syn.cache.add(
192                file_handle_id=manifest_data.dataFileHandleId,
193                path=manifest_original_name_and_path,
194                md5=manifest_data._file_handle.contentMd5,
195            )
196
197        return manifest_data
198
199    def _entity_type_checking(self) -> None:
200        """
201        Check the entity type of the entity that needs to be downloaded.
202
203        Logs an error if the entity type is not "file"; nothing is returned.
204        """
205        # check the type of entity
206        entity_type = entity_type_mapping(
207            syn=self.syn,
208            entity_id=self.manifest_id,
209            synapse_entity_tracker=self.synapse_entity_tracker,
210        )
211        if entity_type != "file":
212            logger.error(
213                f"You are using entity type: {entity_type}. Please provide a file ID"
214            )
215
216    def download_manifest(
217        self,
218        newManifestName: str = "",
219        manifest_df: pd.DataFrame = pd.DataFrame(),
220        use_temporary_folder: bool = True,
221    ) -> Union[str, File]:
222        """
223        Download a manifest based on a given manifest id.
224        Args:
225            newManifestName(optional): new name of a manifest that gets downloaded.
226            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
227        Return:
228            manifest_data: synapse entity file object
229        """
230
231        # enables retrying if user does not have access to uncensored manifest
232        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
233        manifest_data = ""
234
235        # check entity type
236        self._entity_type_checking()
237
238        # download a manifest
239        try:
240            manifest_data = self._download_manifest_to_folder(
241                use_temporary_folder=use_temporary_folder
242            )
243        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
244            # if there's an error getting an uncensored manifest, try getting the censored manifest
245            if not manifest_df.empty:
246                censored_regex = re.compile(".*censored.*")
247                censored = manifest_df["name"].str.contains(censored_regex)
248                new_manifest_id = manifest_df[censored]["id"].iloc[0]
249                self.manifest_id = new_manifest_id
250                try:
251                    manifest_data = self._download_manifest_to_folder(
252                        use_temporary_folder=use_temporary_folder
253                    )
254                except (
255                    SynapseUnmetAccessRestrictions,
256                    SynapseAuthenticationError,
257                ) as e:
258                    raise PermissionError(
259                        "You don't have access to censored and uncensored manifests in this dataset."
260                    ) from e
261            else:
262                logger.error(
263                    f"You don't have access to the requested resource: {self.manifest_id}"
264                )
265
266        if newManifestName and os.path.exists(manifest_data.get("path")):
267            # Rename the file we just made to the new name
268            new_manifest_filename = newManifestName + ".csv"
269
270            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
271            parent_folder = os.path.dirname(manifest_data.get("path"))
272
273            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
274
275            # Copy file to new location. The purpose of using a copy instead of a rename
276            # is to avoid any potential issues with the file being used in another
277            # process. This avoids potential race conditions and concurrency issues.
278            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
279
280            # Adding this to cache will allow us to re-use the already downloaded
281            # manifest file for up to 1 hour.
282            self.syn.cache.add(
283                file_handle_id=manifest_data.dataFileHandleId,
284                path=new_manifest_path_name,
285                md5=manifest_data._file_handle.contentMd5,
286            )
287
288            # Update file names/paths in manifest_data
289            manifest_data["name"] = new_manifest_filename
290            manifest_data["filename"] = new_manifest_filename
291            manifest_data["path"] = new_manifest_path_name
292
293        return manifest_data

syn: an object of type synapseclient.Synapse
manifest_id: id of a manifest
synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

ManifestDownload( syn: synapseclient.client.Synapse, manifest_id: str, synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = <factory>)
syn: synapseclient.client.Synapse
manifest_id: str
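
A minimal construction sketch, assuming an already logged-in Synapse client named `syn` and a hypothetical manifest ID; the entity tracker falls back to a fresh SynapseEntityTracker when not supplied:

    >>> md = ManifestDownload(syn=syn, manifest_id="syn23456789")  # hypothetical ID
    >>> entity = md.download_manifest()  # public entry point, documented below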
def download_manifest( self, newManifestName: str = '', manifest_df: pandas.core.frame.DataFrame = Empty DataFrame Columns: [] Index: [], use_temporary_folder: bool = True) -> Union[str, synapseclient.entity.File]:

Download a manifest based on a given manifest id.

Arguments:
  • newManifestName(optional): new name of a manifest that gets downloaded.
  • manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
Return:
  • manifest_data: synapse entity file object
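
A usage sketch under stated assumptions (`md` is the ManifestDownload instance from the sketch above; `manifest_df` is a hypothetical name/id table for the asset view, consulted only as the censored-manifest fallback):

    >>> import pandas as pd
    >>> manifest_df = pd.DataFrame(
    ...     {"name": ["synapse_storage_manifest.csv",
    ...               "synapse_storage_manifest_censored.csv"],
    ...      "id": ["syn11111111", "syn22222222"]})
    >>> entity = md.download_manifest(newManifestName="my_manifest",
    ...                               manifest_df=manifest_df)
    >>> entity["path"].endswith("my_manifest.csv")
    True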

class SynapseStorage(schematic.store.base.BaseStorage):
 296class SynapseStorage(BaseStorage):
 297    """Implementation of Storage interface for datasets/files stored on Synapse.
 298    Provides utilities to list files in a specific project; update file annotations; create file views; etc.
 299
 300    TODO: Need to define the interface and rename and/or refactor some of the methods below.
 301    """
 302
 303    @tracer.start_as_current_span("SynapseStorage::__init__")
 304    def __init__(
 305        self,
 306        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
 307        access_token: Optional[str] = None,
 308        project_scope: Optional[list] = None,
 309        synapse_cache_path: Optional[str] = None,
 310        perform_query: Optional[bool] = True,
 311        columns: Optional[list] = None,
 312        where_clauses: Optional[list] = None,
 313    ) -> None:
 314        """Initializes a SynapseStorage object.
 315
 316        Args:
 317            token (Optional[str], optional):
 318              Optional token parameter as found in browser cookie upon login to synapse.
 319              Defaults to None.
 320        access_token (Optional[str], optional):
 321              Optional access token (personal or oauth).
 322              Defaults to None.
 323            project_scope (Optional[list], optional): Defaults to None.
 324            synapse_cache_path (Optional[str], optional):
 325              Location of synapse cache.
 326              Defaults to None.
 327        TODO:
 328            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
 329        """
 330        self.syn = self.login(synapse_cache_path, access_token)
 331        self.project_scope = project_scope
 332        self.storageFileview = CONFIG.synapse_master_fileview_id
 333        self.manifest = CONFIG.synapse_manifest_basename
 334        self.root_synapse_cache = self.syn.cache.cache_root_dir
 335        self.synapse_entity_tracker = SynapseEntityTracker()
 336        if perform_query:
 337            self.query_fileview(columns=columns, where_clauses=where_clauses)
 338
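    # --- Editor's illustrative sketch (not part of the module source). ---
    # Credentials come from SYNAPSE_ACCESS_TOKEN or .synapseConfig; the column
    # names are assumptions. The fileview query can be deferred at construction
    # time and issued later with a narrower scope:
    #
    #     >>> store = SynapseStorage(perform_query=False)
    #     >>> store.query_fileview(columns=["id", "name", "path"])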
 339    # TODO: When moving this over to a regular cron-job the following logic should be
 340    # out of `manifest_download`:
 341    # if "SECRETS_MANAGER_SECRETS" in os.environ:
 342    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 343    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
 344    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
 345    def _purge_synapse_cache(
 346        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
 347    ) -> None:
 348        """
 349        Purge synapse cache if it exceeds a certain size. Default to 1GB.
 350        Args:
 351            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
 352              before purging cache. Default is 1 GB.
 353            minute_buffer (int): All files created this amount of time or older will be deleted
 354        """
 355        # try clearing the cache
 356        # scan a directory and check size of files
 357        if os.path.exists(self.root_synapse_cache):
 358            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
 359                1024**3
 360            )
 361            nbytes = get_dir_size(self.root_synapse_cache)
 362            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
 363            # if 1 GB has already been taken, purge cache before 15 min
 364            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
 365                num_of_deleted_files = clear_synapse_cache(
 366                    self.syn.cache, minutes=minute_buffer
 367                )
 368                logger.info(
 369                    f"{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}"
 370                )
 371            else:
 372                # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB)
 373                # instead of guessing how much space we have left, log the total size of .synapseCache here
 374                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
 375
 376    @tracer.start_as_current_span("SynapseStorage::query_fileview")
 377    def query_fileview(
 378        self,
 379        columns: Optional[list] = None,
 380        where_clauses: Optional[list] = None,
 381        force_requery: Optional[bool] = False,
 382    ) -> None:
 383        """
 384        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
 385        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
 386        Args:
 387            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 388            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 389            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
 390        """
 391        self._purge_synapse_cache()
 392
 393        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
 394        self.new_query_different = True
 395
 396        # If a query has already been performed, store the query
 397        previous_query_built = hasattr(self, "fileview_query")
 398        if previous_query_built:
 399            previous_query = self.fileview_query
 400
 401        # Build a query with the current given parameters and check to see if it is different from the previous
 402        self._build_query(columns=columns, where_clauses=where_clauses)
 403        if previous_query_built:
 404            self.new_query_different = self.fileview_query != previous_query
 405
 406        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
 407        if self.new_query_different or force_requery:
 408            try:
 409                self.storageFileviewTable = self.syn.tableQuery(
 410                    query=self.fileview_query,
 411                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
 412            except SynapseHTTPError as exc:
 413                exception_text = str(exc)
 414                if "Unknown column path" in exception_text:
 415                    raise ValueError(
 416                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
 417                    )
 418                elif "Unknown column" in exception_text:
 419                    missing_column = exception_text.split("Unknown column ")[-1]
 420                    raise ValueError(
 421                        f"The column(s) {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
 422                    )
 423                else:
 424                    raise AccessCredentialsError(self.storageFileview)
 425
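    # Editor's sketch of a scoped re-query (the where clause is an assumption).
    # Because the freshly built query is compared against the previous one,
    # repeating an identical call is a no-op unless force_requery=True:
    #
    #     >>> store.query_fileview(columns=["id", "path"],
    #     ...                      where_clauses=["type='file'"])
    #     >>> store.query_fileview(columns=["id", "path"],
    #     ...                      where_clauses=["type='file'"],
    #     ...                      force_requery=True)  # hits Synapse again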
 426    @staticmethod
 427    def build_clause_from_dataset_id(
 428        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
 429    ) -> str:
 430        """
 431        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
 432        Args:
 433            dataset_id: Synapse ID of a dataset that should be used to limit the query
 434            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
 435        Returns:
 436            clause for the query or an empty string if no dataset ID is provided
 437        """
 438        # Calling this method without specifying synIDs will complete but will not scope the view
 439        if (not dataset_id) and (not dataset_folder_list):
 440            return ""
 441
 442        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
 443        if dataset_folder_list:
 444            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
 445            return f"parentId IN ({search_folders})"
 446
 447        # `dataset_id` should be provided when all files are stored directly under the dataset folder
 448        return f"parentId='{dataset_id}'"
 449
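    # Worked examples for the clause builder above (outputs follow directly
    # from the code; the synIDs are made up). Note that dataset_folder_list
    # takes precedence when both arguments are given:
    #
    #     >>> SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
    #     "parentId='syn123'"
    #     >>> SynapseStorage.build_clause_from_dataset_id(
    #     ...     dataset_folder_list=["syn123", "syn456"])
    #     "parentId IN ('syn123', 'syn456')"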
 450    def _build_query(
 451        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
 452    ):
 453        """
 454        Method to build a query for Synapse FileViews
 455        Args:
 456            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 457            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 458            self.storageFileview (str): Synapse FileView ID
 459            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
 460                Gets added to where_clauses; mostly included for backwards compatibility and as a more user-friendly way of subsetting the view.
 461        """
 462        if columns is None:
 463            columns = []
 464        if where_clauses is None:
 465            where_clauses = []
 466
 467        if self.project_scope:
 468            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
 469            where_clauses.append(project_scope_clause)
 470
 471        if where_clauses:
 472            where_clauses = " AND ".join(where_clauses)
 473            where_clauses = f"WHERE {where_clauses} ;"
 474        else:
 475            where_clauses = ";"
 476
 477        if columns:
 478            columns = ",".join(columns)
 479        else:
 480            columns = "*"
 481
 482        self.fileview_query = (
 483            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
 484        )
 485
 486        return
 487
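    # Editor's worked example of the resulting query string (the fileview ID
    # is hypothetical; project_scope is unset here):
    #
    #     >>> store.storageFileview = "syn99999999"
    #     >>> store._build_query(columns=["id", "path"],
    #     ...                    where_clauses=["type='file'"])
    #     >>> store.fileview_query
    #     "SELECT id,path FROM syn99999999 WHERE type='file' ;"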
 488    @staticmethod
 489    @tracer.start_as_current_span("SynapseStorage::login")
 490    def login(
 491        synapse_cache_path: Optional[str] = None,
 492        access_token: Optional[str] = None,
 493    ) -> synapseclient.Synapse:
 494        """Login to Synapse
 495
 496        Args:
 497            access_token (Optional[str], optional): A synapse access token. Defaults to None.
 498            synapse_cache_path (Optional[str]): location of synapse cache
 499
 500        Raises:
 501            ValueError: If unable to log in with the access token
 502
 503        Returns:
 504            synapseclient.Synapse: A Synapse object that is logged in
 505        """
 506        if not access_token:
 507            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
 508
 509        # login using a token
 510        if access_token:
 511            try:
 512                syn = synapseclient.Synapse(
 513                    cache_root_dir=synapse_cache_path,
 514                    debug=False,
 515                    skip_checks=True,
 516                    cache_client=False,
 517                )
 518                syn.login(authToken=access_token, silent=True)
 519            except SynapseHTTPError as exc:
 520                raise ValueError(
 521                    "No access to resources. Please make sure that your token is correct"
 522                ) from exc
 523        else:
 524            # login using synapse credentials provided by user in .synapseConfig (default) file
 525            syn = synapseclient.Synapse(
 526                configPath=CONFIG.synapse_configuration_path,
 527                cache_root_dir=synapse_cache_path,
 528                debug=False,
 529                skip_checks=True,
 530                cache_client=False,
 531            )
 532            syn.login(silent=True)
 533
 534        # set user id attribute
 535        current_span = trace.get_current_span()
 536        if current_span.is_recording():
 537            current_span.set_attribute("user.id", syn.credentials.owner_id)
 538
 539        return syn
 540
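    # Login sketch (editor's addition). With no explicit token, login() falls
    # back to the SYNAPSE_ACCESS_TOKEN environment variable and, failing that,
    # to the .synapseConfig file referenced by CONFIG:
    #
    #     >>> import os
    #     >>> os.environ["SYNAPSE_ACCESS_TOKEN"] = "<personal access token>"
    #     >>> syn = SynapseStorage.login()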
 541    def missing_entity_handler(method):
 542        def wrapper(*args, **kwargs):
 543            try:
 544                return method(*args, **kwargs)
 545            except SynapseHTTPError as ex:
 546                str_message = str(ex).replace("\n", "")
 547                if "trash" in str_message or "does not exist" in str_message:
 548                    logging.warning(str_message)
 549                    return None
 550                else:
 551                    raise ex
 552
 553        return wrapper
 554
 555    def async_missing_entity_handler(method):
 556        """Decorator to handle missing entities in async methods."""
 557
 558        async def wrapper(*args: Any, **kwargs: Any) -> Any:
 559            try:
 560                return await method(*args, **kwargs)
 561            except SynapseHTTPError as ex:
 562                str_message = str(ex).replace("\n", "")
 563                if "trash" in str_message or "does not exist" in str_message:
 564                    logging.warning(str_message)
 565                    return None
 566                else:
 567                    raise ex
 568
 569        return wrapper
 570
 571    def getStorageFileviewTable(self):
 572        """Returns the storageFileviewTable obtained during initialization."""
 573        return self.storageFileviewTable
 574
 575    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
 576        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
 577
 578        Args:
 579            currentUserId: synapse id for the user whose projects we want to get.
 580
 581        Returns:
 582            A dictionary with a next page token and the results.
 583        """
 584        all_results = self.syn.restGET(
 585            "/projects/user/{principalId}".format(principalId=currentUserId)
 586        )
 587
 588        while (
 589            "nextPageToken" in all_results
 590        ):  # iterate over next page token in results while there is any
 591            results_token = self.syn.restGET(
 592                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
 593                    principalId=currentUserId,
 594                    nextPageToken=all_results["nextPageToken"],
 595                )
 596            )
 597            all_results["results"].extend(results_token["results"])
 598
 599            if "nextPageToken" in results_token:
 600                all_results["nextPageToken"] = results_token["nextPageToken"]
 601            else:
 602                del all_results["nextPageToken"]
 603
 604        return all_results
 605
 606    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
 607    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
 608        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
 609
 610        Returns:
 611            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
 612        """
 613
 614        # get the set of all storage Synapse project accessible for this pipeline
 615        storageProjects = self.storageFileviewTable["projectId"].unique()
 616
 617        # get the set of storage Synapse project accessible for this user
 618        # get a list of projects from Synapse
 619        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
 620            current_user_id=self.syn.credentials.owner_id, syn=self.syn
 621        )
 622        project_id_to_name_dict = {}
 623        current_user_projects = []
 624        for project_header in current_user_project_headers:
 625            project_id_to_name_dict[project_header.get("id")] = project_header.get(
 626                "name"
 627            )
 628            current_user_projects.append(project_header.get("id"))
 629
 630        # find set of user projects that are also in this pipeline's storage projects set
 631        storageProjects = list(set(storageProjects) & set(current_user_projects))
 632
 633        # Limit projects to scope if specified
 634        if project_scope:
 635            storageProjects = list(set(storageProjects) & set(project_scope))
 636
 637            if not storageProjects:
 638                raise Warning(
 639                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
 640                )
 641
 642        # prepare a return list of project IDs and names
 643        projects = []
 644        for projectId in storageProjects:
 645            project_name_from_project_header = project_id_to_name_dict.get(projectId)
 646            projects.append((projectId, project_name_from_project_header))
 647
 648        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
 649
 650        return sorted_projects_list
 651
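    # Shape of the return value (editor's sketch; IDs and names are made up):
    #
    #     >>> store.getStorageProjects()
    #     [('syn11111111', 'Project A'), ('syn22222222', 'Project B')]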
 652    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
 653    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
 654        """Gets all datasets in folder under a given storage project that the current user has access to.
 655
 656        Args:
 657            projectId: synapse ID of a storage project.
 658
 659        Returns:
 660            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
 661            None: If the projectId cannot be found on Synapse.
 662        """
 663
 664        # select all folders and fetch their names from within the storage project;
 665        # if folder content type is defined, only select folders that contain datasets
 666        if "contentType" in self.storageFileviewTable.columns:
 667            foldersTable = self.storageFileviewTable[
 668                (self.storageFileviewTable["contentType"] == "dataset")
 669                & (self.storageFileviewTable["projectId"] == projectId)
 670            ]
 671        else:
 672            foldersTable = self.storageFileviewTable[
 673                (self.storageFileviewTable["type"] == "folder")
 674                & (self.storageFileviewTable["parentId"] == projectId)
 675            ]
 676
 677        # get an array of tuples (folderId, folderName)
 678        # some folders are part of datasets; others contain datasets
 679        # each dataset parent is the project; folders part of a dataset have another folder as a parent
 680        # to get folders if and only if they contain datasets for each folder
 681        # check if folder's parent is the project; if so that folder contains a dataset,
 682        # unless the folder list has already been filtered to dataset folders based on contentType attribute above
 683
 684        datasetList = []
 685        folderProperties = ["id", "name"]
 686        for folder in list(
 687            foldersTable[folderProperties].itertuples(index=False, name=None)
 688        ):
 689            datasetList.append(folder)
 690
 691        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
 692
 693        return sorted_dataset_list
 694
 695    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
 696    def getFilesInStorageDataset(
 697        self, datasetId: str, fileNames: List = None, fullpath: bool = True
 698    ) -> List[Tuple[str, str]]:
 699        """Gets all files (excluding manifest files) in a given dataset folder.
 700
 701        Args:
 702            datasetId: synapse ID of a storage dataset.
 703            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
 704            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
 705            fullpath: if True return the full path as part of this filename; otherwise return just base filename
 706
 707        Returns:
 708            A list of files; the list consists of tuples (fileId, fileName).
 709
 710        Raises:
 711            ValueError: Dataset ID not found.
 712        """
 713        file_list = []
 714
 715        # Get path to dataset folder by using its children to avoid cases where the dataset is the scope of the view
 716        if self.storageFileviewTable.empty:
 717            raise ValueError(
 718                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
 719            )
 720        child_path = self.storageFileviewTable.loc[
 721            self.storageFileviewTable["parentId"] == datasetId, "path"
 722        ]
 723        if child_path.empty:
 724            raise LookupError(
 725                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
 726            )
 727        child_path = child_path.iloc[0]
 728
 729        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
 730        parent = child_path.split("/")[:-1]
 731        parent = "/".join(parent)
 732
 733        # When querying, only include files to exclude entity files and subdirectories
 734        where_clauses = [create_like_statement(parent), "type='file'"]
 735
 736        # Requery the fileview to specifically get the files in the given dataset
 737        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
 738
 739        # Exclude manifest files
 740        non_manifest_files = self.storageFileviewTable.loc[
 741            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
 742            :,
 743        ]
 744
 745        # Remove all files that are not in the list of fileNames
 746        if fileNames:
 747            filename_regex = "|".join(fileNames)
 748
 749            matching_files = non_manifest_files["path"].str.contains(
 750                filename_regex, case=False, regex=True
 751            )
 752
 753            non_manifest_files = non_manifest_files.loc[matching_files, :]
 754
 755        # Truncate path if necessary
 756        if not fullpath:
 757            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
 758
 759        # Return list of files as expected by other methods
 760        file_list = list(non_manifest_files.itertuples(index=False, name=None))
 761
 762        return file_list
 763
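    # Usage sketch (editor's addition; the dataset ID is an assumption).
    # Manifest files are always excluded; with fullpath=False only basenames
    # are returned:
    #
    #     >>> store.getFilesInStorageDataset("syn12345678", fullpath=False)
    #     [('syn33333333', 'sample_A.bam'), ('syn44444444', 'sample_B.bam')]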
 764    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
 765        """If both censored and uncensored manifests are present, return the uncensored manifest; if only one manifest is present, return its manifest id; if more than one manifest is present, return the manifest id of the first one.
 766        Args:
 767        manifest: a dataframe containing name and id of manifests in a given asset view
 768
 769        Return:
 770        manifest_syn_id: id of a given censored or uncensored manifest
 771        """
 772        censored_regex = re.compile(".*censored.*")
 773        censored = manifest["name"].str.contains(censored_regex)
 774        if any(censored):
 775            # Try to use uncensored manifest first
 776            not_censored = ~censored
 777            if any(not_censored):
 778                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
 779            # if only censored manifests are available, just use the first censored manifest
 780            else:
 781                manifest_syn_id = manifest["id"].iloc[0]
 782
 783        # otherwise, use the first (implied only) version that exists
 784        else:
 785            manifest_syn_id = manifest["id"].iloc[0]
 786
 787        return manifest_syn_id
 788
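    # Worked example (editor's sketch): when both a censored and an uncensored
    # manifest are present, the uncensored one is preferred:
    #
    #     >>> manifest = pd.DataFrame(
    #     ...     {"name": ["synapse_storage_manifest_censored.csv",
    #     ...               "synapse_storage_manifest.csv"],
    #     ...      "id": ["syn22222222", "syn11111111"]})
    #     >>> store._get_manifest_id(manifest)
    #     'syn11111111'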
 789    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
 790    def getDatasetManifest(
 791        self,
 792        datasetId: str,
 793        downloadFile: bool = False,
 794        newManifestName: str = "",
 795        use_temporary_folder: bool = True,
 796    ) -> Union[str, File]:
 797        """Gets the manifest associated with a given dataset.
 798
 799        Args:
 800            datasetId: synapse ID of a storage dataset.
 801            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
 802            newManifestName: new name of a manifest that gets downloaded
 803            use_temporary_folder: boolean argument indicating if a temporary folder
 804                should be used to store the manifest file. This is useful when running
 805                this code as an API server where multiple requests could be made at the
 806                same time. This is set to False when the code is being used from the
 807                CLI. Defaults to True.
 808
 809        Returns:
 810            manifest_syn_id (String): Synapse ID of existing manifest file.
 811            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
 812            "" (String): No pre-existing manifest in dataset.
 813        """
 814        manifest_data = ""
 815
 816        # get a list of files containing the manifest for this dataset (if any)
 817        all_files = self.storageFileviewTable
 818
 819        # construct regex based on manifest basename in the config
 820        manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv")
 821
 822        # search manifest based on given manifest basename regex above
 823        # and return a dataframe containing name and id of manifests in a given asset view
 824        manifest = all_files[
 825            (all_files["name"].str.contains(manifest_re, regex=True))
 826            & (all_files["parentId"] == datasetId)
 827        ]
 828
 829        manifest = manifest[["id", "name"]]
 830
 831        # if there is no pre-existing manifest in the specified dataset
 832        if manifest.empty:
 833            logger.warning(
 834                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
 835            )
 836            return ""
 837
 838        # if there is an existing manifest
 839        else:
 840            manifest_syn_id = self._get_manifest_id(manifest)
 841            if downloadFile:
 842                md = ManifestDownload(
 843                    self.syn,
 844                    manifest_id=manifest_syn_id,
 845                    synapse_entity_tracker=self.synapse_entity_tracker,
 846                )
 847                manifest_data = md.download_manifest(
 848                    newManifestName=newManifestName,
 849                    manifest_df=manifest,
 850                    use_temporary_folder=use_temporary_folder,
 851                )
 852                # TODO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
 853                # then we should catch the error here without returning an empty string.
 854                if not manifest_data:
 855                    logger.debug(
 856                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
 857                    )
 858                return manifest_data
 859            return manifest_syn_id
 860
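    # Usage sketch (editor's addition; the dataset ID is an assumption).
    # Without downloadFile only the manifest's Synapse ID is returned; with
    # downloadFile=True a File entity with a local .path is returned:
    #
    #     >>> store.getDatasetManifest("syn12345678")
    #     'syn55555555'
    #     >>> entity = store.getDatasetManifest("syn12345678", downloadFile=True)
    #     >>> entity.path  # local path of the downloaded manifest CSV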
 861    def getDataTypeFromManifest(self, manifestId: str):
 862        """Fetch a manifest and return data types of all columns
 863        Args:
 864            manifestId: synapse ID of a manifest
 865        """
 866        # get manifest file path
 867        manifest_entity = self.synapse_entity_tracker.get(
 868            synapse_id=manifestId, syn=self.syn, download_file=True
 869        )
 870        manifest_filepath = manifest_entity.path
 871
 872        # load manifest dataframe
 873        manifest = load_df(
 874            manifest_filepath,
 875            preserve_raw_input=False,
 876            data_model=False,
 877        )
 878
 879        # convert the dataFrame to use best possible dtypes.
 880        manifest_new = manifest.convert_dtypes()
 881
 882        # get data types of columns
 883        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
 884
 885        # return the result as a dictionary
 886        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
 887
 888        return result_dict
 889
 890    def _get_files_metadata_from_dataset(
 891        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
 892    ) -> Optional[dict]:
 893        """retrieve file ids under a particular datasetId
 894
 895        Args:
 896            datasetId (str): a dataset id
 897            only_new_files (bool): whether to add only new files that do not already exist in the manifest
 898            manifest (pd.DataFrame): metadata manifest dataframe. Default to None.
 899
 900        Returns:
 901            a dictionary that contains filename and entityId under a given datasetId, or None if there are no files under the given dataset id
 902        """
 903        dataset_files = self.getFilesInStorageDataset(datasetId)
 904        if dataset_files:
 905            dataset_file_names_id_dict = self._get_file_entityIds(
 906                dataset_files, only_new_files=only_new_files, manifest=manifest
 907            )
 908            return dataset_file_names_id_dict
 909        else:
 910            return None
 911
 912    def add_entity_id_and_filename(
 913        self, datasetId: str, manifest: pd.DataFrame
 914    ) -> pd.DataFrame:
 915        """add entityid and filename column to an existing manifest assuming entityId column is not already present
 916
 917        Args:
 918            datasetId (str): dataset syn id
 919            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
 920
 921        Returns:
 922            pd.DataFrame: the updated manifest dataframe with Filename and entityId columns filled in
 923        """
 924        # get file names and entity ids of a given dataset
 925        dataset_files_dict = self._get_files_metadata_from_dataset(
 926            datasetId, only_new_files=False
 927        )
 928
 929        if dataset_files_dict:
 930            # turn manifest dataframe back to a dictionary for operation
 931            manifest_dict = manifest.to_dict("list")
 932
 933            # update Filename column
 934            # add entityId column to the end
 935            manifest_dict.update(dataset_files_dict)
 936
 937            # if the component column exists in existing manifest, fill up that column
 938            if "Component" in manifest_dict.keys():
 939                manifest_dict["Component"] = manifest_dict["Component"] * max(
 940                    1, len(manifest_dict["Filename"])
 941                )
 942
 943            # turn dictionary back to a dataframe
 944            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
 945            manifest_df_updated = manifest_df_index.transpose()
 946
 947            # fill na with empty string
 948            manifest_df_updated = manifest_df_updated.fillna("")
 949
 950            # drop index
 951            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
 952
 953            return manifest_df_updated
 954        else:
 955            return manifest
 956
 957    def fill_in_entity_id_filename(
 958        self, datasetId: str, manifest: pd.DataFrame
 959    ) -> Tuple[List, pd.DataFrame]:
 960        """Fill in the Filename and entityId columns. Both columns will be created if not already present.
 961
 962        Args:
 963            datasetId (str): dataset syn id
 964            manifest (pd.DataFrame): existing manifest dataframe.
 965
 966        Returns:
 967            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 968        """
 969        # get dataset file names and entity id as a list of tuple
 970        dataset_files = self.getFilesInStorageDataset(datasetId)
 971
 972        # update manifest with additional filenames, if any
 973        # note that if there is an existing manifest and there are files in the dataset
 974        # the columns Filename and entityId are assumed to be present in manifest schema
 975        # TODO: use idiomatic pandas syntax
 976        if not dataset_files:
 977            manifest = manifest.fillna("")
 978            return dataset_files, manifest
 979
 980        all_files = self._get_file_entityIds(
 981            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 982        )
 983        new_files = self._get_file_entityIds(
 984            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 985        )
 986
 987        all_files = pd.DataFrame(all_files)
 988        new_files = pd.DataFrame(new_files)
 989
 990        # update manifest so that it contains new dataset files
 991        manifest = (
 992            pd.concat([manifest, new_files], sort=False)
 993            .reset_index()
 994            .drop("index", axis=1)
 995        )
 996
 997        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 998        manifest_reindex = manifest.set_index("entityId")
 999        all_files_reindex = all_files.set_index("entityId")
1000        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1001            manifest_reindex
1002        )
1003
1004        # Check if individual file paths in manifest and from synapse match
1005        file_paths_match = (
1006            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1007        )
1008
1009        # If any of the paths do not match, update the manifest with the filepaths from Synapse
1010        if not file_paths_match.all():
1011            manifest_reindex.loc[
1012                ~file_paths_match, "Filename"
1013            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1014
1015            # reformat manifest for further use
1016            manifest = manifest_reindex.reset_index()
1017            entityIdCol = manifest.pop("entityId")
1018            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1019
1020        manifest = manifest.fillna("")
1021        return dataset_files, manifest
1022
1023    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1024    def updateDatasetManifestFiles(
1025        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1026    ) -> Union[Tuple[str, pd.DataFrame], None]:
1027        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1028
1029        Args:
1030            dmge: DataModelGraphExplorer Instance
1031            datasetId: synapse ID of a storage dataset.
1032            store: if set to True store updated manifest in asset store; if set to False
1033            return a Pandas dataframe containing updated manifest but do not store to asset store
1034
1035
1036        Returns:
1037            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1038            If there is no existing manifest or if the manifest does not have an entityId column, return None
1039        """
1040
1041        # get existing manifest Synapse ID
1042        manifest_id = self.getDatasetManifest(datasetId)
1043
1044        # if there is no manifest return None
1045        if not manifest_id:
1046            return None
1047
1048        manifest_entity = self.synapse_entity_tracker.get(
1049            synapse_id=manifest_id, syn=self.syn, download_file=True
1050        )
1051        manifest_filepath = manifest_entity.path
1052        manifest = load_df(manifest_filepath)
1053
1054        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1055        if "entityId" not in manifest.columns:
1056            return None
1057
1058        manifest_is_file_based = "Filename" in manifest.columns
1059
1060        if manifest_is_file_based:
1061            # update manifest with additional filenames, if any
1062            # note that if there is an existing manifest and there are files in the dataset
1063            # the columns Filename and entityId are assumed to be present in manifest schema
1064            # TODO: use idiomatic pandas syntax
1065            dataset_files, manifest = self.fill_in_entity_id_filename(
1066                datasetId, manifest
1067            )
1068            if dataset_files:
1069                # update the manifest file, so that it contains the relevant entity IDs
1070                if store:
1071                    manifest.to_csv(manifest_filepath, index=False)
1072
1073                    # store manifest and update associated metadata with manifest on Synapse
1074                    manifest_id = self.associateMetadataWithFiles(
1075                        dmge, manifest_filepath, datasetId
1076                    )
1077
1078        return manifest_id, manifest
1079
1080    def _get_file_entityIds(
1081        self,
1082        dataset_files: List,
1083        only_new_files: bool = False,
1084        manifest: pd.DataFrame = None,
1085    ):
1086        """
1087        Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
1088
1089        Args:
1090            manifest: metadata manifest
1091            dataset_files: List of all files in a dataset
1092            only_new_files: boolean to control whether only new files are returned or all files in the dataset
1093        Returns:
1094            files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
1095        """
1096        files = {"Filename": [], "entityId": []}
1097
1098        if only_new_files:
1099            if manifest is None:
1100                raise UnboundLocalError(
1101                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1102                )
1103
1104            if "entityId" not in manifest.columns:
1105                raise ValueError(
1106                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1107                    "Please generate an empty manifest without annotations, manually add annotations to the "
1108                    "appropriate files in the manifest, and then try again."
1109                )
1110
1111            # find new files (that are not in the current manifest) if any
1112            for file_id, file_name in dataset_files:
1113                if file_id not in manifest["entityId"].values:
1114                    files["Filename"].append(file_name)
1115                    files["entityId"].append(file_id)
1116        else:
1117            # get all files
1118            for file_id, file_name in dataset_files:
1119                files["Filename"].append(file_name)
1120                files["entityId"].append(file_id)
1121
1122        return files
1123
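    # Worked example (editor's sketch). With only_new_files=True, files whose
    # entityId already appears in the manifest are skipped:
    #
    #     >>> dataset_files = [("syn1", "a.txt"), ("syn2", "b.txt")]
    #     >>> manifest = pd.DataFrame({"entityId": ["syn1"]})
    #     >>> store._get_file_entityIds(dataset_files, only_new_files=True,
    #     ...                           manifest=manifest)
    #     {'Filename': ['b.txt'], 'entityId': ['syn2']}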
1124    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1125    def getProjectManifests(
1126        self, projectId: str
1127    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1128        """Gets all metadata manifest files across all datasets in a specified project.
1129
1130        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1131                 as a list of tuples, one for each manifest:
1132                    [
1133                        (
1134                            (datasetId, dataName),
1135                            (manifestId, manifestName),
1136                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1137                        ),
1138                        ...
1139                    ]
1140
1141        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1142        """
1143        component = None
1144        entity = None
1145        manifests = []
1146
1147        datasets = self.getStorageDatasetsInProject(projectId)
1148
1149        for datasetId, datasetName in datasets:
1150            # encode information about the manifest in a simple list (so that R clients can unpack it)
1151            # eventually can serialize differently
1152
1153            # Get synID of manifest for a dataset
1154            manifestId = self.getDatasetManifest(datasetId)
1155
1156            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1157            if manifestId:
1158                annotations = self.getFileAnnotations(manifestId)
1159
1160                # If manifest has annotations specifying component, use that
1161                if annotations and "Component" in annotations:
1162                    component = annotations["Component"]
1163                    entity = self.synapse_entity_tracker.get(
1164                        synapse_id=manifestId, syn=self.syn, download_file=False
1165                    )
1166                    manifest_name = entity["properties"]["name"]
1167
1168                # otherwise download the manifest and parse for information
1169                else:
1170                    logger.debug(
1171                        f"No component annotations have been found for manifest {manifestId}. "
1172                        "The manifest will be downloaded and parsed instead. "
1173                        "For increased speed, add component annotations to manifest."
1174                    )
1175
1176                    manifest_info = self.getDatasetManifest(
1177                        datasetId, downloadFile=True
1178                    )
1179                    manifest_name = manifest_info["properties"].get("name", "")
1180
1181                    if not manifest_name:
1182                        logger.error(f"Failed to download manifest from {datasetId}")
1183
1184                    manifest_path = manifest_info["path"]
1185
1186                    manifest_df = load_df(manifest_path)
1187
1188                    # Get component from component column if it exists
1189                    if (
1190                        "Component" in manifest_df
1191                        and not manifest_df["Component"].empty
1192                    ):
1193                        component = list(set(manifest_df["Component"]))
1195
1196                        # Added to address issues raised during DCA testing
1197                        if "" in component:
1198                            component.remove("")
1199
1200                        if len(component) == 1:
1201                            component = component[0]
1202                        elif len(component) > 1:
1203                            logger.warning(
1204                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1205                                "Behavior of manifests with multiple components is undefined."
1206                            )
1207            else:
1208                manifest_name = ""
1209                component = None
1210            if component:
1211                manifest = (
1212                    (datasetId, datasetName),
1213                    (manifestId, manifest_name),
1214                    (component, component),
1215                )
1216            elif manifestId:
1217                logger.debug(
1218                    f"Manifest {manifestId} does not have an associated Component"
1219                )
1220                manifest = (
1221                    (datasetId, datasetName),
1222                    (manifestId, manifest_name),
1223                    ("", ""),
1224                )
1225            else:
1226                manifest = (
1227                    (datasetId, datasetName),
1228                    ("", ""),
1229                    ("", ""),
1230                )
1231
1232            if manifest:
1233                manifests.append(manifest)
1234
1235        return manifests
1236
1237    def upload_project_manifests_to_synapse(
1238        self, dmge: DataModelGraphExplorer, projectId: str
1239    ) -> List[str]:
1240        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1241
1242        Returns: List of the names of the datasets whose manifests were loaded as tables.
1243        """
1244
1246        manifest_loaded = []
1247        datasets = self.getStorageDatasetsInProject(projectId)
1248
1249        for datasetId, datasetName in datasets:
1255            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1256            if manifest_info:
1257                manifest_id = manifest_info["properties"]["id"]
1258                manifest_name = manifest_info["properties"]["name"]
1259                manifest_path = manifest_info["path"]
1260                manifest_df = load_df(manifest_path)
1261                manifest_table_id, _, _ = self.uploadDB(
1262                    dmge=dmge,
1263                    manifest=manifest_df,
1264                    datasetId=datasetId,
1265                    table_name=datasetName,
1266                )
1267                manifest_loaded.append(datasetName)
1268        return manifest_loaded
1269
1270    def upload_annotated_project_manifests_to_synapse(
1271        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1272    ) -> Tuple[List, List]:
1273        """
1274        Purpose:
1275            For all manifests in a project, upload them as a table and add annotations manifest csv.
1276            Assumes the manifest is already present as a CSV in a dataset in the project.
1277
1278        """
1279        # Instantiate DataModelParser
            # Local imports: these classes are not imported at the top of this module
            from schematic.schemas.data_model_parser import DataModelParser
            from schematic.schemas.data_model_graph import DataModelGraph

1280        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1281        # Parse Model
1282        parsed_data_model = data_model_parser.parse_model()
1283
1284        # Instantiate DataModelGraph
1285        data_model_grapher = DataModelGraph(parsed_data_model)
1286
1287        # Generate graph
1288        graph_data_model = data_model_grapher.generate_data_model_graph()
1289
1290        # Instantiate DataModelGraphExplorer
1291        dmge = DataModelGraphExplorer(graph_data_model)
1292
1293        manifests = []
1294        manifest_loaded = []
1295        datasets = self.getStorageDatasetsInProject(projectId)
1296        for datasetId, datasetName in datasets:
1297            # encode information about the manifest in a simple list (so that R clients can unpack it)
1298            # eventually can serialize differently
1299
1300            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1301            manifests.append(manifest)
1302
1303            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1304
1305            if manifest_info:
1306                manifest_id = manifest_info["properties"]["id"]
1307                manifest_name = manifest_info["properties"]["name"]
1308                manifest_path = manifest_info["path"]
1309                manifest = (
1310                    (datasetId, datasetName),
1311                    (manifest_id, manifest_name),
1312                    ("", ""),
1313                )
1314                if not dry_run:
1315                    self.associateMetadataWithFiles(
1316                        dmge, manifest_path, datasetId, manifest_record_type="table"
1317                    )
1318                manifest_loaded.append(manifest)
1319
1320        return manifests, manifest_loaded
1321
1322    def move_entities_to_new_project(
1323        self,
1324        projectId: str,
1325        newProjectId: str,
1326        returnEntities: bool = False,
1327        dry_run: bool = False,
1328    ):
1329        """
1330        For each manifest csv in a project, look for all the entitiy ids that are associated.
1331        Look up the entitiy in the files, move the entity to new project.
1332        """
1333
1334        manifests = []
1335        manifest_loaded = []
1336        datasets = self.getStorageDatasetsInProject(projectId)
1337        if datasets:
1338            for datasetId, datasetName in datasets:
1339                # encode information about the manifest in a simple list (so that R clients can unpack it)
1340                # eventually can serialize differently
1341
1342                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1343                manifests.append(manifest)
1344
1345                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1346                if manifest_info:
1347                    manifest_id = manifest_info["properties"]["id"]
1348                    manifest_name = manifest_info["properties"]["name"]
1349                    manifest_path = manifest_info["path"]
1350                    manifest_df = load_df(manifest_path)
1351
1352                    manifest = (
1353                        (datasetId, datasetName),
1354                        (manifest_id, manifest_name),
1355                        ("", ""),
1356                    )
1357                    manifest_loaded.append(manifest)
1358
1359                    annotation_entities = self.storageFileviewTable[
1360                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1361                        & (self.storageFileviewTable["type"] == "folder")
1362                    ]["id"]
1363
1364                    if returnEntities:
1365                        for entityId in annotation_entities:
1366                            if not dry_run:
1367                                moved_entity = self.syn.move(entityId, datasetId)
1368                                self.synapse_entity_tracker.add(
1369                                    synapse_id=moved_entity.id, entity=moved_entity
1370                                )
1371                            else:
1372                                logging.info(
1373                                    f"{entityId} will be moved to folder {datasetId}."
1374                                )
1375                    else:
1376                        # generate project folder
1377                        archive_project_folder = Folder(
1378                            projectId + "_archive", parent=newProjectId
1379                        )
1380                        archive_project_folder = self.syn.store(archive_project_folder)
1381                        self.synapse_entity_tracker.add(
1382                            synapse_id=archive_project_folder.id,
1383                            entity=archive_project_folder,
1384                        )
1385
1386                        # generate dataset folder
1387                        dataset_archive_folder = Folder(
1388                            "_".join([datasetId, datasetName, "archive"]),
1389                            parent=archive_project_folder.id,
1390                        )
1391                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1392                        self.synapse_entity_tracker.add(
1393                            synapse_id=dataset_archive_folder.id,
1394                            entity=dataset_archive_folder,
1395                        )
1396
1397                        for entityId in annotation_entities:
1398                            # move entities to folder
1399                            if not dry_run:
1400                                moved_entity = self.syn.move(
1401                                    entityId, dataset_archive_folder.id
1402                                )
1403                                self.synapse_entity_tracker.add(
1404                                    synapse_id=moved_entity.id, entity=moved_entity
1405                                )
1406                            else:
1407                                logging.info(
1408                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1409                                )
1410        else:
1411            raise LookupError(
1412                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1413            )
1414        return manifests, manifest_loaded
1415
1416    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1417    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1418        """Download a Synapse table as a pandas DataFrame; also return the query results object, which carries the table schema and etags
1419
1420        Args:
1421            synapse_id: synapse ID of the table to query
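
        Example (illustrative; the table ID is hypothetical):
            df, results = store.get_synapse_table("syn987")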
1422        """
1423
1424        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1425        df = results.asDataFrame(
1426            rowIdAndVersionInIndex=False,
1427            na_values=STR_NA_VALUES_FILTERED,
1428            keep_default_na=False,
1429        )
1430
1431        return df, results
1432
1433    @missing_entity_handler
1434    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1435    def uploadDB(
1436        self,
1437        dmge: DataModelGraphExplorer,
1438        manifest: pd.DataFrame,
1439        datasetId: str,
1440        table_name: str,
1441        restrict: bool = False,
1442        table_manipulation: str = "replace",
1443        table_column_names: str = "class_label",
1444    ):
1445        """
1446        Method to upload a database to an asset store. In synapse, this will upload a metadata table
1447
1448        Args:
1449            dmge: DataModelGraphExplorer object
1450            manifest: pd.Df manifest to upload
1451            datasetId: synID of the dataset for the manifest
1452            table_name: name of the table to be uploaded
1453            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1455            table_manipulation: str, 'replace', 'upsert', or 'update'; when a manifest table already exists, determines whether the new metadata replaces the existing table (replace), is added to it (upsert), or updates it (update)
1456            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1457                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1458                display label formatting.
1459        Returns:
1460            manifest_table_id: synID of the uploaded table
1461            manifest: the original manifest
1462            table_manifest: manifest formatted appropriately for the table
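
        Example (illustrative; the dataset ID and table name are hypothetical):
            table_id, manifest, table_manifest = store.uploadDB(
                dmge=dmge,
                manifest=manifest_df,
                datasetId="syn123",
                table_name="biospecimen_synapse_storage_manifest_table",
            )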
1463
1464        """
1465
1466        col_schema, table_manifest = self.formatDB(
1467            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1468        )
1469
1470        manifest_table_id = self.buildDB(
1471            datasetId,
1472            table_name,
1473            col_schema,
1474            table_manifest,
1475            table_manipulation,
1476            dmge,
1477            restrict,
1478        )
1479
1480        return manifest_table_id, manifest, table_manifest
1481
1482    @tracer.start_as_current_span("SynapseStorage::formatDB")
1483    def formatDB(self, dmge, manifest, table_column_names):
1484        """
1485        Method to format a manifest appropriately for upload as a table
1486
1487        Args:
1488            dmge: DataModelGraphExplorer object
1489            manifest: pd.Df manifest to upload
1490            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1491                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1492                display label formatting.
1493        Returns:
1494            col_schema: schema for table columns: type, size, etc
1495            table_manifest: formatted manifest
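
        Example (illustrative; shows the column-label translation only):
            # With table_column_names="display_label", a column displayed as
            # "Sample ID (primary)" becomes "SampleIDprimary", since the
            # blacklisted characters "(", ")", ".", " ", and "-" are stripped.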
1496
1497        """
1498        # Rename the manifest columns to display names to match fileview
1499
1500        blacklist_chars = ["(", ")", ".", " ", "-"]
1501        manifest_columns = manifest.columns.tolist()
1502
1503        table_manifest = deepcopy(manifest)
1504
1505        if table_column_names == "display_name":
1506            cols = table_manifest.columns
1507
1508        elif table_column_names == "display_label":
1509            cols = [
1510                str(col).translate({ord(x): "" for x in blacklist_chars})
1511                for col in manifest_columns
1512            ]
1513
1514        elif table_column_names == "class_label":
1515            cols = [
1516                get_class_label_from_display_name(str(col)).translate(
1517                    {ord(x): "" for x in blacklist_chars}
1518                )
1519                for col in manifest_columns
1520            ]
1521        else:
1522            raise ValueError(
1523                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1524            )
1525
1526        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1527
1528        # Reset column names in table manifest
1529        table_manifest.columns = cols
1530
1531        # move entity id to end of df
1532        entity_col = table_manifest.pop("entityId")
1533        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1534
1535        # Get the column schema
1536        col_schema = as_table_columns(table_manifest)
1537
1538        # Set the Id column maximumSize to 64 (it is not set automatically)
1539        for i, col in enumerate(col_schema):
1540            if col["name"].lower() == "id":
1541                col_schema[i]["maximumSize"] = 64
1542
1543        return col_schema, table_manifest
1544
1545    @tracer.start_as_current_span("SynapseStorage::buildDB")
1546    def buildDB(
1547        self,
1548        datasetId: str,
1549        table_name: str,
1550        col_schema: List,
1551        table_manifest: pd.DataFrame,
1552        table_manipulation: str,
1553        dmge: DataModelGraphExplorer,
1554        restrict: bool = False,
1555    ):
1556        """
1557        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1558        Calls TableOperations class to execute
1559
1560        Args:
1561            datasetId: synID of the dataset for the manifest
1562            table_name: name of the table to be uploaded
1563            col_schema: schema for table columns: type, size, etc from `formatDB`
1564            table_manifest: formatted manifest that can be uploaded as a table
1565            table_manipulation: str, 'replace', 'upsert', or 'update'; when a manifest table already exists, determines whether the new metadata replaces the existing table (replace), is added to it (upsert), or updates it (update)
1566            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1567
1568        Returns:
1569            manifest_table_id: synID of the uploaded table
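
        Example (illustrative; assumes `col_schema` and `table_manifest` come from `formatDB`):
            table_id = store.buildDB(
                datasetId="syn123",
                table_name="biospecimen_synapse_storage_manifest_table",
                col_schema=col_schema,
                table_manifest=table_manifest,
                table_manipulation="replace",
                dmge=dmge,
            )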
1570
1571        """
1572        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1573        existing_table_id = self.syn.findEntityId(
1574            name=table_name, parent=table_parent_id
1575        )
1576        tableOps = TableOperations(
1577            synStore=self,
1578            tableToLoad=table_manifest,
1579            tableName=table_name,
1580            datasetId=datasetId,
1581            existingTableId=existing_table_id,
1582            restrict=restrict,
1583            synapse_entity_tracker=self.synapse_entity_tracker,
1584        )
1585
1586        if not table_manipulation or existing_table_id is None:
1587            manifest_table_id = tableOps.createTable(
1588                columnTypeDict=col_schema,
1589                specifySchema=True,
1590            )
1591        elif existing_table_id is not None:
1592            if table_manipulation.lower() == "replace":
1593                manifest_table_id = tableOps.replaceTable(
1594                    specifySchema=True,
1595                    columnTypeDict=col_schema,
1596                )
1597            elif table_manipulation.lower() == "upsert":
1598                manifest_table_id = tableOps.upsertTable(
1599                    dmge=dmge,
1600                )
1601            elif table_manipulation.lower() == "update":
1602                manifest_table_id = tableOps.updateTable()
            else:
                raise ValueError(
                    f"Unsupported table_manipulation: {table_manipulation}. "
                    "Allowed values are 'replace', 'upsert', and 'update'."
                )
1603
1604        if table_manipulation and table_manipulation.lower() == "upsert":
1605            table_entity = self.synapse_entity_tracker.get(
1606                synapse_id=existing_table_id or manifest_table_id,
1607                syn=self.syn,
1608                download_file=False,
1609            )
1610            annos = OldAnnotations(
1611                id=table_entity.id,
1612                etag=table_entity.etag,
1613                values=table_entity.annotations,
1614            )
1615            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1616            annos = self.syn.set_annotations(annos)
1617            table_entity.etag = annos.etag
1618            table_entity.annotations = annos
1619
1620        return manifest_table_id
1621
1622    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1623    def upload_manifest_file(
1624        self,
1625        manifest,
1626        metadataManifestPath,
1627        datasetId,
1628        restrict_manifest,
1629        component_name="",
1630    ):
1631        # Update manifest to have the new entityId column
1632        manifest.to_csv(metadataManifestPath, index=False)
1633
1634        # store manifest to Synapse as a CSV
1635        # update file name
1636        file_name_full = metadataManifestPath.split("/")[-1]
1637        file_extension = file_name_full.split(".")[-1]
1638
1639        # Differentiate "censored" and "uncensored" manifest
1640        if "censored" in file_name_full:
1641            file_name_new = (
1642                os.path.basename(CONFIG.synapse_manifest_basename)
1643                + "_"
1644                + component_name
1645                + "_censored"
1646                + "."
1647                + file_extension
1648            )
1649        else:
1650            file_name_new = (
1651                os.path.basename(CONFIG.synapse_manifest_basename)
1652                + "_"
1653                + component_name
1654                + "."
1655                + file_extension
1656            )
1657
1658        manifest_synapse_file = None
1659        try:
1660            # Rename the file to file_name_new then revert
1661            # This is to maintain the original file name in-case other code is
1662            # expecting that the file exists with the original name
1663            original_file_path = metadataManifestPath
1664            new_file_path = os.path.join(
1665                os.path.dirname(metadataManifestPath), file_name_new
1666            )
1667            os.rename(original_file_path, new_file_path)
1668
1669            manifest_synapse_file = self._store_file_for_manifest_upload(
1670                new_file_path=new_file_path,
1671                dataset_id=datasetId,
1672                existing_file_name=file_name_full,
1673                file_name_new=file_name_new,
1674                restrict_manifest=restrict_manifest,
1675            )
1676            manifest_synapse_file_id = manifest_synapse_file.id
1677
1678        finally:
1679            # Revert the file name back to the original
1680            os.rename(new_file_path, original_file_path)
1681
1682            if manifest_synapse_file:
1683                manifest_synapse_file.path = original_file_path
1684
1685        return manifest_synapse_file_id
1686
1687    def _store_file_for_manifest_upload(
1688        self,
1689        new_file_path: str,
1690        dataset_id: str,
1691        existing_file_name: str,
1692        file_name_new: str,
1693        restrict_manifest: bool,
1694    ) -> File:
1695        """Handles a create or update of a manifest file that is going to be uploaded.
1696        If we already have a copy of the Entity in memory we will update that instance,
1697        otherwise create a new File instance to be created in Synapse. Once stored
1698        this will add the file to the `synapse_entity_tracker` for future reference.
1699
1700        Args:
1701            new_file_path (str): The path to the new manifest file
1702            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
1703            existing_file_name (str): The name of the existing file
1704            file_name_new (str): The name of the new file
1705            restrict_manifest (bool): Whether the manifest should be restricted
1706
1707        Returns:
1708            File: The stored manifest file
1709        """
1710        local_tracked_file_instance = (
1711            self.synapse_entity_tracker.search_local_by_parent_and_name(
1712                name=existing_file_name, parent_id=dataset_id
1713            )
1714            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1715                name=file_name_new, parent_id=dataset_id
1716            )
1717        )
1718
1719        if local_tracked_file_instance:
1720            local_tracked_file_instance.path = new_file_path
1721            local_tracked_file_instance.description = (
1722                "Manifest for dataset " + dataset_id
1723            )
1724            manifest_synapse_file = local_tracked_file_instance
1725        else:
1726            manifest_synapse_file = File(
1727                path=new_file_path,
1728                description="Manifest for dataset " + dataset_id,
1729                parent=dataset_id,
1730                name=file_name_new,
1731            )
1732
1733        manifest_synapse_file = self.syn.store(
1734            manifest_synapse_file, isRestricted=restrict_manifest
1735        )
1736
1737        self.synapse_entity_tracker.add(
1738            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1739        )
1740        return manifest_synapse_file
1741
1742    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1743        """get annotations asynchronously
1744
1745        Args:
1746            synapse_id (str): synapse id of the entity that the annotation belongs to
1747
1748        Returns:
1749            Dict[str, Any]: The requested entity bundle matching
1750            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1751        """
1752        return await get_entity_id_bundle2(
1753            entity_id=synapse_id,
1754            request={"includeAnnotations": True},
1755            synapse_client=self.syn,
1756        )
1757
1758    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1759        """store annotation in an async way
1760
1761        Args:
1762            annotation_dict (dict): annotation in a dictionary format
1763
1764        Returns:
1765            Annotations: The stored annotations.
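
        Example (illustrative; pairs with `get_async_annotation`, Synapse ID is hypothetical):
            annos = await store.get_async_annotation("syn123")
            stored = await store.store_async_annotation(annotation_dict=annos)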
1766        """
1767        annotation_data = Annotations.from_dict(
1768            synapse_annotations=annotation_dict["annotations"]["annotations"]
1769        )
1770        annotation_class = Annotations(
1771            annotations=annotation_data,
1772            etag=annotation_dict["annotations"]["etag"],
1773            id=annotation_dict["annotations"]["id"],
1774        )
1775        annotation_storage_result = await annotation_class.store_async(
1776            synapse_client=self.syn
1777        )
1778        local_entity = self.synapse_entity_tracker.get(
1779            synapse_id=annotation_dict["annotations"]["id"],
1780            syn=self.syn,
1781            download_file=False,
1782            retrieve_if_not_present=False,
1783        )
1784        if local_entity:
1785            local_entity.etag = annotation_storage_result.etag
1786            local_entity.annotations = annotation_storage_result
1787        return annotation_storage_result
1788
1789    def process_row_annotations(
1790        self,
1791        dmge: DataModelGraphExplorer,
1792        metadata_syn: Dict[str, Any],
1793        hide_blanks: bool,
1794        csv_list_regex: str,
1795        annos: Dict[str, Any],
1796        annotation_keys: str,
1797    ) -> Dict[str, Any]:
1798        """Processes metadata annotations based on the logic below:
1799        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1800            An empty or whitespace-only string.
1801            A NaN value (if the annotation is a float).
1802        If either condition is met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped.
1803        If the value is NaN and hide_blanks is False, an empty string "" is assigned as the annotation value for that key.
1804
1805        2. If the value is a string matching the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name".
1806        If the rule contains "list", split the string by commas and assign the resulting list as the annotation value for that key.
1807
1808        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1809
1810        4. Returns the updated annotations dictionary.
1811
1812        Args:
1813            dmge (DataModelGraphExplorer): data model graph explorer
1814            metadata_syn (dict): metadata used for Synapse storage
1815            hide_blanks (bool): if true, does not upload annotation keys with blank values.
1816            csv_list_regex (str): Regex to match with comma separated list
1817            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1818            annotation_keys (str): display_label/class_label
1819
1820        Returns:
1821            Dict[str, Any]: annotations as a dictionary
1822
1823        ```mermaid
1824        flowchart TD
1825            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1826            C -- Yes --> D{Is hide_blanks True?}
1827            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1828            D -- No --> F[Assign empty string to annotation key]
1829            C -- No --> G{Is anno_v a string?}
1830            G -- No --> H[Assign original value of anno_v to annotation key]
1831            G -- Yes --> I{Does anno_v match csv_list_regex?}
1832            I -- Yes --> J[Get validation rule of anno_k]
1833            J --> K{Does the validation rule contain 'list'}
1834            K -- Yes --> L[Split anno_v by commas and assign as list]
1835            I -- No --> H
1836            K -- No --> H
1837        ```
1838        """
1839        for anno_k, anno_v in metadata_syn.items():
1840            # Remove keys whose values are NaN, empty strings, or whitespace-only strings
1841            # from the dict of annotations to be uploaded, if present on the current annotation
1842            if hide_blanks and (
1843                (isinstance(anno_v, str) and anno_v.strip() == "")
1844                or (isinstance(anno_v, float) and np.isnan(anno_v))
1845            ):
1846                if anno_k in annos["annotations"]["annotations"]:
1847                    annos["annotations"]["annotations"].pop(anno_k)
1849                continue
1850
1851            # Otherwise save annotation as appropriate
1852            if isinstance(anno_v, float) and np.isnan(anno_v):
1853                annos["annotations"]["annotations"][anno_k] = ""
1854                continue
1855
1856            # Handle strings that match the csv_list_regex and pass the validation rule
1857            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1858                # Use a dictionary to dynamically choose the argument
1859                param = (
1860                    {"node_display_name": anno_k}
1861                    if annotation_keys == "display_label"
1862                    else {"node_label": anno_k}
1863                )
1864                node_validation_rules = dmge.get_node_validation_rules(**param)
1865
1866                if rule_in_rule_list("list", node_validation_rules):
1867                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1868                    continue
1869            # default: assign the original value
1870            annos["annotations"]["annotations"][anno_k] = anno_v
1871
1872        return annos
1873
1874    @async_missing_entity_handler
1875    async def format_row_annotations(
1876        self,
1877        dmge: DataModelGraphExplorer,
1878        row: pd.Series,
1879        entityId: str,
1880        hideBlanks: bool,
1881        annotation_keys: str,
1882    ) -> Union[None, Dict[str, Any]]:
1883        """Format row annotations
1884
1885        Args:
1886            dmge (DataModelGraphExplorer): data model graph explorer object
1887            row (pd.Series): row of the manifest
1888            entityId (str): entity id of the manifest
1889            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, upload Annotation keys with empty string values
1890            annotation_keys (str): display_label/class_label
1891
1892        Returns:
1893            Union[None, Dict[str, Any]]: if the entity id is in the trash can, return None. Otherwise, return the annotations
1894        """
1895        # Prepare metadata for Synapse storage: resolve each display name into a name that
1896        # Synapse annotations support (e.g. no spaces or parentheses).
1897        # Note: special characters are removed from annotation keys only; the manifest is not altered,
1898        # so manifest columns and annotation keys may diverge. This should be fine for most use cases,
            # since columns with special characters are outside of the schema.
1899        metadataSyn = {}
1900        blacklist_chars = ["(", ")", ".", " ", "-"]
1901
1902        for k, v in row.to_dict().items():
1903            if annotation_keys == "display_label":
1904                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1905            elif annotation_keys == "class_label":
1906                keySyn = get_class_label_from_display_name(str(k)).translate(
1907                    {ord(x): "" for x in blacklist_chars}
1908                )
1909
1910            # Skip `Filename` and `ETag` columns when setting annotations
1911            if keySyn in ["Filename", "ETag", "eTag"]:
1912                continue
1913
1914            # Truncate annotation values that are 500 characters or longer, keeping the
1915            # first 472 characters and appending an explicit [truncatedByDataCuratorApp]
1916            # marker so that the final value stays under the 500-character annotation
1917            # limit and readers know the cell value has been truncated
1919            if isinstance(v, str) and len(v) >= 500:
1920                v = v[0:472] + "[truncatedByDataCuratorApp]"
1921
1922            metadataSyn[keySyn] = v
1923
1924        # This will first check if the entity is already in memory, and if so, that
1925        # instance is used. Unfortunately, the expected return format needs to match
1926        # the Synapse API, so we need to convert the annotations to the expected format.
1927        entity = self.synapse_entity_tracker.get(
1928            synapse_id=entityId,
1929            syn=self.syn,
1930            download_file=False,
1931            retrieve_if_not_present=False,
1932        )
1933        if entity is not None:
1934            synapse_annotations = _convert_to_annotations_list(
1935                annotations=entity.annotations
1936            )
1937            annos = {
1938                "annotations": {
1939                    "id": entity.id,
1940                    "etag": entity.etag,
1941                    "annotations": synapse_annotations,
1942                }
1943            }
1944        else:
1945            annos = await self.get_async_annotation(entityId)
1946
1947        # set annotation(s) for the various objects/items in a dataset on Synapse
1948        csv_list_regex = comma_separated_list_regex()
1949
1950        annos = self.process_row_annotations(
1951            dmge=dmge,
1952            metadata_syn=metadataSyn,
1953            hide_blanks=hideBlanks,
1954            csv_list_regex=csv_list_regex,
1955            annos=annos,
1956            annotation_keys=annotation_keys,
1957        )
1958
1959        return annos
1960
1961    @missing_entity_handler
1962    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
1963    def format_manifest_annotations(self, manifest, manifest_synapse_id):
1964        """
1965        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
1966        For now just getting the Component.
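
        Example (illustrative; the manifest Synapse ID is hypothetical):
            annos = store.format_manifest_annotations(manifest, "syn456")
            store.syn.set_annotations(annos)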
1967        """
1968
1969        entity = self.synapse_entity_tracker.get(
1970            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
1971        )
1972        is_file = entity.concreteType.endswith(".FileEntity")
1973        is_table = entity.concreteType.endswith(".TableEntity")
1974
1975        if is_file:
1976            # Get file metadata
1977            metadata = self.getFileAnnotations(manifest_synapse_id)
1978
1979            # If there is a defined component add it to the metadata.
1980            if "Component" in manifest.columns:
1981                # Gather component information
1982                component = manifest["Component"].unique()
1983
1984                # Double check that only a single component is listed, else raise an error.
1985                if len(component) != 1:
1986                    raise ValueError(
1987                        "Manifest has more than one component. Please check manifest and resubmit."
1988                    )
1991
1992                # Add component to metadata
1993                metadata["Component"] = component[0]
1994
1995        elif is_table:
1996            # Get table metadata
1997            metadata = self.getTableAnnotations(manifest_synapse_id)
        else:
            raise ValueError(
                f"Manifest entity {manifest_synapse_id} is neither a file nor a table."
            )
1998
1999        # Get annotations
2000        annos = OldAnnotations(
2001            id=entity.id, etag=entity.etag, values=entity.annotations
2002        )
2003
2004        # Add metadata to the annotations
2005        for annos_k, annos_v in metadata.items():
2006            annos[annos_k] = annos_v
2007        return annos
2008
2009    '''
2010    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
2011        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
2012        """
2013        Purpose:
2014            Works very similarly to associateMetadataWithFiles except takes in the manifest
2015            rather than the manifest path
2016
2017        """
2018
2019        # Add uuid for table updates and fill.
2020        if not "Uuid" in manifest.columns:
2021            manifest["Uuid"] = ''
2022
2023        for idx,row in manifest.iterrows():
2024            if not row["Uuid"]:
2025                gen_uuid = uuid.uuid4()
2026                row["Uuid"] = gen_uuid
2027                manifest.loc[idx, 'Uuid'] = gen_uuid
2028
2029        # add entityId as a column if not already there or
2030        # fill any blanks with an empty string.
2031        if not "entityId" in manifest.columns:
2032            manifest["entityId"] = ""
2033        else:
2034            manifest["entityId"].fillna("", inplace=True)
2035
2036        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
2037        dmge = DataModelGraphExplorer()
2038
2039        # Create table name here.
2040        if 'Component' in manifest.columns:
2041            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
2042        else:
2043            table_name = 'synapse_storage_manifest_table'
2044
2045        # Upload manifest as a table and get the SynID and manifest
2046        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
2047                                                    dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
2048
2049        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
2050        # also set metadata for each synapse entity as Synapse annotations
2051        for idx, row in manifest.iterrows():
2052            if not row["entityId"]:
2053                # If not using entityIds, fill with manifest_table_id so
2054                row["entityId"] = manifest_synapse_table_id
2055                entityId = ''
2056            else:
2057                # get the entity id corresponding to this row
2058                entityId = row["entityId"]
2059
2060        # Load manifest to synapse as a CSV File
2061        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
2062
2063        # Get annotations for the file manifest.
2064        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
2065
2066        self.syn.set_annotations(manifest_annotations)
2067
2068        logger.info("Associated manifest file with dataset on Synapse.")
2069
2070        # Update manifest Synapse table with new entity id column.
2071        self.make_synapse_table(
2072            table_to_load = table_manifest,
2073            dataset_id = datasetId,
2074            existingTableId = manifest_synapse_table_id,
2075            table_name = table_name,
2076            update_col = 'Uuid',
2077            specify_schema = False,
2078            )
2079
2080        # Get annotations for the table manifest
2081        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
2082        self.syn.set_annotations(manifest_annotations)
2083        return manifest_synapse_table_id
2084    '''
2085
2086    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
2087        """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing.
2088        Args:
2089            metadataManifestPath (str): path where manifest is stored
2090        Returns:
2091            manifest(pd.DataFrame): Manifest loaded as a pandas dataframe
2092        Raises:
2093            FileNotFoundError: Manifest file does not exist at provided path.
2094        """
2095        # read new manifest csv
2096        try:
2097            load_args = {
2098                "dtype": "string",
2099            }
2100            manifest = load_df(
2101                metadataManifestPath,
2102                preserve_raw_input=False,
2103                allow_na_values=False,
2104                **load_args,
2105            )
2106        except FileNotFoundError as err:
2107            raise FileNotFoundError(
2108                f"No manifest file was found at this path: {metadataManifestPath}"
2109            ) from err
2110        return manifest
2111
2112    def _add_id_columns_to_manifest(
2113        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
2114    ) -> pd.DataFrame:
2115        """
2116        Ensures that the manifest DataFrame has standardized 'Id' and 'entityId' columns.
2117
2118        - If any case variation of the 'id' column is present (e.g., 'id', 'ID', 'iD'), it is renamed to 'Id'.
2119        - If any case variation of the 'entityid' column is present, it is renamed to 'entityId'.
2120        - If any case variation of the 'uuid' column is present (e.g., 'Uuid', 'UUID'), it is renamed to lowercase 'uuid' before further processing.
2121        - If 'Id' is still missing:
2122            - It will be created as an empty column, or
2123            - Derived from a 'Uuid' column, depending on whether 'uuid' is defined in the schema.
2124        - If both 'uuid' and 'Id' columns exist, the 'uuid' column is dropped.
2125        - Missing values in the 'Id' column are filled with generated UUIDs.
2126        - If 'entityId' is still missing, it will be created and filled with empty strings.
2127        - If 'entityId' is already present, any missing values will be replaced with empty strings.
2128
2129        Args:
2130            manifest (pd.DataFrame): The metadata manifest to be updated.
2131            dmge (DataModelGraphExplorer): Data model graph explorer object.
2132
2133        Returns:
2134            pd.DataFrame: The updated manifest with a standardized 'Id' column and an 'entityId' column.
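
        Example (illustrative):
            # A manifest with columns ['ID', 'ENTITYID'] is normalized to ['Id', 'entityId'];
            # blank 'Id' cells are then filled with generated UUIDs.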
2135        """
2136
2137        # Normalize any variation of 'id' to 'Id', "entityid" to "entityId", "Uuid" to "uuid"
2138        for col in manifest.columns:
2139            if col.lower() == "id":
2140                manifest = manifest.rename(columns={col: ID_COLUMN})
2141            if col.lower() == "entityid":
2142                manifest = manifest.rename(columns={col: ENTITY_ID_COLUMN})
2143            if col.lower() == "uuid":
2144                manifest = manifest.rename(columns={col: UUID_COLUMN})
2145
2146        # If 'Id' still doesn't exist, see if uuid column exists
2147        # Rename uuid column to "Id" column
2148        if ID_COLUMN not in manifest.columns:
2149            # See if schema has `Uuid` column specified
2150            try:
2151                uuid_col_in_schema = dmge.is_class_in_schema(
2152                    "Uuid"
2153                ) or dmge.is_class_in_schema("uuid")
2154            except KeyError:
2155                uuid_col_in_schema = False
2156
2157            # Rename `uuid` column if it wasn't specified in the schema
2158            if UUID_COLUMN in manifest.columns and not uuid_col_in_schema:
2159                manifest = manifest.rename(columns={UUID_COLUMN: ID_COLUMN})
2160            # If no `uuid` column exists or it is specified in the schema, create a new `Id` column
2161            else:
2162                manifest[ID_COLUMN] = ""
2163        else:
2164            # 'Id' already exists, ignore 'uuid'
2165            if UUID_COLUMN in manifest.columns:
2166                manifest = manifest.drop(columns=[UUID_COLUMN])
2167
2168        # Fill in UUIDs in the "Id" column if missing
2169        for idx, row in manifest.iterrows():
2170            if not row["Id"]:
2171                gen_uuid = str(uuid.uuid4())
2172                row["Id"] = gen_uuid
2173                manifest.loc[idx, ID_COLUMN] = gen_uuid
2174
2175        # Add entityId as a column if not already there
2176        if ENTITY_ID_COLUMN not in manifest:
2177            manifest[ENTITY_ID_COLUMN] = ""
2178        else:
2179            manifest[ENTITY_ID_COLUMN] = manifest[ENTITY_ID_COLUMN].fillna("")
2180
2181        return manifest
2182
2183    def _generate_table_name(self, manifest):
2184        """Helper function to generate a table name for upload to synapse.
2185
2186        Args:
2187            manifest: Manifest loaded as a pd.DataFrame
2188
2189        Returns:
2190            table_name (str): Name of the table to load
2191            component_name (str): Name of the manifest component (if applicable)
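
        Example (illustrative):
            # A manifest whose 'Component' column starts with "Biospecimen" yields
            # ("biospecimen_synapse_storage_manifest_table", "biospecimen").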
2192        """
2193        # Create table name here.
2194        if "Component" in manifest.columns:
2195            component_name = manifest["Component"][0].lower()
2196            table_name = component_name + "_synapse_storage_manifest_table"
2197        else:
2198            component_name = ""
2199            table_name = "synapse_storage_manifest_table"
2200        return table_name, component_name
2201
2202    def _create_entity_id(self, idx, row, manifest, datasetId):
2203        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
2204        Args:
            idx: index of the manifest row currently being processed
2205            row: current row of manifest being processed
2206            manifest (pd.DataFrame): loaded df containing user supplied data.
2207            datasetId (str): synapse ID of folder containing the dataset
2208
2209        Returns:
2210            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
2211            entityId (str): Generated Entity Id.
2212
2213        """
2214        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
2215        rowEntity = self.syn.store(rowEntity)
2216        entityId = rowEntity["id"]
2217        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
2218        row["entityId"] = entityId
2219        manifest.loc[idx, "entityId"] = entityId
2220        return manifest, entityId
2221
2222    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
2223        """Process annotations and store them on synapse asynchronously
2224
2225        Args:
2226            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step
2227
2228        Raises:
2229            RuntimeError: raise a run time error if a task failed to complete
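
        Example (illustrative; pairs with `format_row_annotations`):
            tasks = {
                asyncio.create_task(
                    self.format_row_annotations(dmge, row, entityId, hideBlanks, annotation_keys)
                )
            }
            await self._process_store_annos(tasks)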
2230        """
2231        while requests:
2232            done_tasks, pending_tasks = await asyncio.wait(
2233                requests, return_when=asyncio.FIRST_COMPLETED
2234            )
2235            requests = pending_tasks
2236
2237            for completed_task in done_tasks:
2238                try:
2239                    annos = completed_task.result()
2240
2241                    if isinstance(annos, Annotations):
2242                        logger.info(f"Successfully stored annotations for {annos.id}")
2243                    else:
2244                        # store annotations if they are not None
2245                        if annos:
2246                            entity_id = annos["annotations"]["id"]
2247                            logger.info(
2248                                f"Obtained and processed annotations for {entity_id} entity"
2249                            )
2250                            requests.add(
2251                                asyncio.create_task(
2252                                    self.store_async_annotation(annotation_dict=annos)
2253                                )
2254                            )
2255                except Exception as e:
2256                    raise RuntimeError(f"failed with {repr(e)}.") from e
2257
2258    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2259    async def add_annotations_to_entities_files(
2260        self,
2261        dmge,
2262        manifest,
2263        manifest_record_type: str,
2264        datasetId: str,
2265        hideBlanks: bool,
2266        manifest_synapse_table_id="",
2267        annotation_keys: str = "class_label",
2268    ):
2269        """
2270        Depending on upload type, add Ids to the entityId row. Add annotations to connected
2271        files and folders. Despite the name of this function, it also applies to folders.
2272
2273        Args:
2274            dmge: DataModelGraphExplorer Object
2275            manifest (pd.DataFrame): loaded df containing user supplied data.
2276            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2277            datasetId (str): synapse ID of folder containing the dataset
2278            hideBlanks (bool): Default is False. When True, annotation keys with blank values are not uploaded; when False, they are uploaded with empty string values.
2279            manifest_synapse_table_id (str): Default is an empty string ''.
2280            annotation_keys: (str) display_label/class_label(default), Determines labeling style for annotation keys. class_label will format the display
2281                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2282                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2283        Returns:
2284            manifest (pd.DataFrame): modified to add entityId as appropriate
2285
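        Example (illustrative; the dataset ID is hypothetical):
            manifest = await store.add_annotations_to_entities_files(
                dmge, manifest, manifest_record_type="file_and_entities",
                datasetId="syn123", hideBlanks=True,
            )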
2286        """
2287
2288        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2289        if "filename" in [col.lower() for col in manifest.columns]:
2290            # get current list of files and store as dataframe
2291            dataset_files = self.getFilesInStorageDataset(datasetId)
2292            files_and_entityIds = self._get_file_entityIds(
2293                dataset_files=dataset_files, only_new_files=False
2294            )
2295            file_df = pd.DataFrame(files_and_entityIds)
2296
2297            # Merge dataframes to add entityIds
2298            manifest = manifest.merge(
2299                file_df, how="left", on="Filename", suffixes=["_x", None]
2300            ).drop("entityId_x", axis=1)
2301
2302        # Fill `entityId` for each row if missing and annotate entity as appropriate
2303        requests = set()
2304        for idx, row in manifest.iterrows():
2305            if not row["entityId"] and (
2306                manifest_record_type == "file_and_entities"
2307                or manifest_record_type == "table_file_and_entities"
2308            ):
2309                manifest, entityId = self._create_entity_id(
2310                    idx, row, manifest, datasetId
2311                )
2312            elif not row["entityId"] and manifest_record_type == "table_and_file":
2313                # If not using entityIds, fill with manifest_table_id so the row points at the manifest table
2314                row["entityId"] = manifest_synapse_table_id
2315                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2316                entityId = ""
2317                # If the row is the manifest table, do not add annotations
2318            elif row["entityId"] == manifest_synapse_table_id:
2319                entityId = ""
2320            else:
2321                # get the file id of the file to annotate, collected in above step.
2322                entityId = row["entityId"]
2323
2324            # Adding annotations to connected files.
2325            if entityId:
2326                # Format annotations for Synapse
2327                annos_task = asyncio.create_task(
2328                    self.format_row_annotations(
2329                        dmge, row, entityId, hideBlanks, annotation_keys
2330                    )
2331                )
2332                requests.add(annos_task)
2333        await self._process_store_annos(requests)
2334        return manifest
2335
2336    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2337    def upload_manifest_as_table(
2338        self,
2339        dmge: DataModelGraphExplorer,
2340        manifest: pd.DataFrame,
2341        metadataManifestPath: str,
2342        datasetId: str,
2343        table_name: str,
2344        component_name: str,
2345        restrict: bool,
2346        manifest_record_type: str,
2347        hideBlanks: bool,
2348        table_manipulation: str,
2349        table_column_names: str,
2350        annotation_keys: str,
2351        file_annotations_upload: bool = True,
2352    ):
2353        """Upload manifest to Synapse as a table and csv.
2354        Args:
2355            dmge: DataModelGraphExplorer object
2356            manifest (pd.DataFrame): loaded df containing user supplied data.
2357            metadataManifestPath: path to csv containing a validated metadata manifest.
2358            datasetId (str): synapse ID of folder containing the dataset
2359            table_name (str): Generated to name the table being uploaded.
2360            component_name (str): Name of the component manifest that is currently being uploaded.
2361            restrict (bool): Flag for censored data.
2362            manifest_record_type (str): Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, house the manifest in a Synapse table, or combine these options.
2363            hideBlanks (bool): Default is False. Boolean flag; when true, annotation keys with blank values are not uploaded. When false, annotation keys with empty string values are uploaded.
2364            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2365            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2366                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2367                display label formatting.
2368            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2369                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2370                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2371            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2372        Return:
2373            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2374        """
2375        # Upload manifest as a table, get the ID and updated manifest.
2376        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2377            dmge=dmge,
2378            manifest=manifest,
2379            datasetId=datasetId,
2380            table_name=table_name,
2381            restrict=restrict,
2382            table_manipulation=table_manipulation,
2383            table_column_names=table_column_names,
2384        )
2385
2386        if file_annotations_upload:
2387            manifest = asyncio.run(
2388                self.add_annotations_to_entities_files(
2389                    dmge,
2390                    manifest,
2391                    manifest_record_type,
2392                    datasetId,
2393                    hideBlanks,
2394                    manifest_synapse_table_id,
2395                    annotation_keys,
2396                )
2397            )
2398        # Load manifest to synapse as a CSV File
2399        manifest_synapse_file_id = self.upload_manifest_file(
2400            manifest=manifest,
2401            metadataManifestPath=metadataManifestPath,
2402            datasetId=datasetId,
2403            restrict_manifest=restrict,
2404            component_name=component_name,
2405        )
2406
2407        # Set annotations for the file manifest.
2408        manifest_annotations = self.format_manifest_annotations(
2409            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2410        )
2411        annos = self.syn.set_annotations(annotations=manifest_annotations)
2412        manifest_entity = self.synapse_entity_tracker.get(
2413            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2414        )
2415        manifest_entity.annotations = annos
2416        manifest_entity.etag = annos.etag
2417
2418        logger.info("Associated manifest file with dataset on Synapse.")
2419
2420        # Update manifest Synapse table with new entity id column.
2421        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2422            dmge=dmge,
2423            manifest=manifest,
2424            datasetId=datasetId,
2425            table_name=table_name,
2426            restrict=restrict,
2427            table_manipulation="update",
2428            table_column_names=table_column_names,
2429        )
2430
2431        # Set annotations for the table manifest
2432        manifest_annotations = self.format_manifest_annotations(
2433            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2434        )
2435        annotations_manifest_table = self.syn.set_annotations(
2436            annotations=manifest_annotations
2437        )
2438        manifest_table_entity = self.synapse_entity_tracker.get(
2439            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2440        )
2441        manifest_table_entity.annotations = annotations_manifest_table
2442        manifest_table_entity.etag = annotations_manifest_table.etag
2443
2444        return manifest_synapse_file_id
2445
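For orientation, a minimal sketch of invoking this method (not from the source; the dataset ID, file path, and the dmge construction are hypothetical placeholders):

    import pandas as pd

    from schematic.schemas.data_model_graph import DataModelGraphExplorer
    from schematic.store.synapse import SynapseStorage

    syn_store = SynapseStorage()  # assumes valid Synapse credentials (see login below)
    dmge: DataModelGraphExplorer = ...  # built from your data model; construction omitted
    manifest = pd.read_csv("synapse_storage_manifest.csv")  # hypothetical path

    file_id = syn_store.upload_manifest_as_table(
        dmge=dmge,
        manifest=manifest,
        metadataManifestPath="synapse_storage_manifest.csv",
        datasetId="syn00000000",  # hypothetical dataset folder
        table_name="example_manifest_table",
        component_name="Example",
        restrict=False,
        manifest_record_type="table_and_file",
        hideBlanks=False,
        table_manipulation="replace",
        table_column_names="class_label",
        annotation_keys="class_label",
    )
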
2446    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2447    def upload_manifest_as_csv(
2448        self,
2449        dmge,
2450        manifest,
2451        metadataManifestPath,
2452        datasetId,
2453        restrict,
2454        manifest_record_type,
2455        hideBlanks,
2456        component_name,
2457        annotation_keys: str,
2458        file_annotations_upload: bool = True,
2459    ):
2460        """Upload manifest to Synapse as a csv only.
2461        Args:
2462            dmge: DataModelGraphExplorer object
2463            manifest (pd.DataFrame): loaded df containing user supplied data.
2464            metadataManifestPath: path to csv containing a validated metadata manifest.
2465            datasetId (str): synapse ID of folder containing the dataset
2466            restrict (bool): Flag for censored data.
2467            manifest_record_type (str): Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, house the manifest in a Synapse table, or combine these options.
2468            hideBlanks (bool): Default is False. Boolean flag; when true, annotation keys with blank values are not uploaded. When false, annotation keys with empty string values are uploaded.
2469            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2470                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2471                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2472            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2473        Return:
2474            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2475        """
2476        if file_annotations_upload:
2477            manifest = asyncio.run(
2478                self.add_annotations_to_entities_files(
2479                    dmge,
2480                    manifest,
2481                    manifest_record_type,
2482                    datasetId,
2483                    hideBlanks,
2484                    annotation_keys=annotation_keys,
2485                )
2486            )
2487
2488        # Load manifest to synapse as a CSV File
2489        manifest_synapse_file_id = self.upload_manifest_file(
2490            manifest,
2491            metadataManifestPath,
2492            datasetId,
2493            restrict,
2494            component_name=component_name,
2495        )
2496
2497        # Set annotations for the file manifest.
2498        manifest_annotations = self.format_manifest_annotations(
2499            manifest, manifest_synapse_file_id
2500        )
2501        annos = self.syn.set_annotations(manifest_annotations)
2502        manifest_entity = self.synapse_entity_tracker.get(
2503            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2504        )
2505        manifest_entity.annotations = annos
2506        manifest_entity.etag = annos.etag
2507
2508        logger.info("Associated manifest file with dataset on Synapse.")
2509
2510        return manifest_synapse_file_id
2511
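A corresponding sketch for the csv-only path, reusing syn_store, dmge, and manifest from the earlier sketch (all values hypothetical):

    file_id = syn_store.upload_manifest_as_csv(
        dmge=dmge,
        manifest=manifest,
        metadataManifestPath="synapse_storage_manifest.csv",
        datasetId="syn00000000",
        restrict=False,
        manifest_record_type="file_only",
        hideBlanks=False,
        component_name="Example",
        annotation_keys="class_label",
    )
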
2512    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2513    def upload_manifest_combo(
2514        self,
2515        dmge,
2516        manifest,
2517        metadataManifestPath,
2518        datasetId,
2519        table_name,
2520        component_name,
2521        restrict,
2522        manifest_record_type,
2523        hideBlanks,
2524        table_manipulation,
2525        table_column_names: str,
2526        annotation_keys: str,
2527        file_annotations_upload: bool = True,
2528    ):
2529        """Upload manifest to Synapse as a table and CSV with entities.
2530        Args:
2531            dmge: DataModelGraphExplorer object
2532            manifest (pd.DataFrame): loaded df containing user supplied data.
2533            metadataManifestPath: path to csv containing a validated metadata manifest.
2534            datasetId (str): synapse ID of folder containing the dataset
2535            table_name (str): Generated to name the table being uploaded.
2536            component_name (str): Name of the component manifest that is currently being uploaded.
2537            restrict (bool): Flag for censored data.
2538            manifest_record_type (str): Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to store the manifest as a csv only, create entity ids and folders for each row, house the manifest in a Synapse table, or combine these options.
2539            hideBlanks (bool): Default is False. Boolean flag; when true, annotation keys with blank values are not uploaded. When false, annotation keys with empty string values are uploaded.
2540            table_manipulation (str): Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2541            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2542                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2543                display label formatting.
2544            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2545                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2546                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2547            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2548        Return:
2549            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2550        """
2551        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2552            dmge=dmge,
2553            manifest=manifest,
2554            datasetId=datasetId,
2555            table_name=table_name,
2556            restrict=restrict,
2557            table_manipulation=table_manipulation,
2558            table_column_names=table_column_names,
2559        )
2560
2561        if file_annotations_upload:
2562            manifest = asyncio.run(
2563                self.add_annotations_to_entities_files(
2564                    dmge,
2565                    manifest,
2566                    manifest_record_type,
2567                    datasetId,
2568                    hideBlanks,
2569                    manifest_synapse_table_id,
2570                    annotation_keys=annotation_keys,
2571                )
2572            )
2573
2574        # Load manifest to synapse as a CSV File
2575        manifest_synapse_file_id = self.upload_manifest_file(
2576            manifest, metadataManifestPath, datasetId, restrict, component_name
2577        )
2578
2579        # Set annotations for the file manifest.
2580        manifest_annotations = self.format_manifest_annotations(
2581            manifest, manifest_synapse_file_id
2582        )
2583        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2584        manifest_entity = self.synapse_entity_tracker.get(
2585            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2586        )
2587        manifest_entity.annotations = file_manifest_annotations
2588        manifest_entity.etag = file_manifest_annotations.etag
2589        logger.info("Associated manifest file with dataset on Synapse.")
2590
2591        # Update manifest Synapse table with new entity id column.
2592        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2593            dmge=dmge,
2594            manifest=manifest,
2595            datasetId=datasetId,
2596            table_name=table_name,
2597            restrict=restrict,
2598            table_manipulation="update",
2599            table_column_names=table_column_names,
2600        )
2601
2602        # Set annotations for the table manifest
2603        manifest_annotations = self.format_manifest_annotations(
2604            manifest, manifest_synapse_table_id
2605        )
2606        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2607        manifest_entity = self.synapse_entity_tracker.get(
2608            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2609        )
2610        manifest_entity.annotations = table_manifest_annotations
2611        manifest_entity.etag = table_manifest_annotations.etag
2612        return manifest_synapse_file_id
2613
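Note the shape of the combo flow above: the table is written twice so that the entityId column produced while annotating files lands in the final table. As an outline (a reading aid, not code from the source):

    # 1. uploadDB(...)                              -> create/replace the manifest table
    # 2. add_annotations_to_entities_files(...)     -> create row entities, fill entityId
    # 3. upload_manifest_file(...)                  -> store the manifest csv
    # 4. uploadDB(..., table_manipulation="update") -> push the entityId column to the table
    # 5. set_annotations(...)                       -> annotate both the csv file and the table
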
2614    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2615    def associateMetadataWithFiles(
2616        self,
2617        dmge: DataModelGraphExplorer,
2618        metadataManifestPath: str,
2619        datasetId: str,
2620        manifest_record_type: str = "table_file_and_entities",
2621        hideBlanks: bool = False,
2622        restrict_manifest=False,
2623        table_manipulation: str = "replace",
2624        table_column_names: str = "class_label",
2625        annotation_keys: str = "class_label",
2626        file_annotations_upload: bool = True,
2627    ) -> str:
2628        """Associate metadata with files in a storage dataset already on Synapse.
2629        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2630
2631        If this is a new manifest, there may be no Synapse entities associated with the rows of this manifest;
2632        this may be because the data type (e.g. clinical data) is tabular
2633        and does not require files. To utilize uniform interfaces downstream
2634        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2635        and an entity column is added to the manifest containing the resulting
2636        entity IDs; a table is also created at present as an additional interface
2637        for downstream query and interaction with the data.
2638
2639        Args:
2640            dmge: DataModelGraphExplorer Object
2641            metadataManifestPath: path to csv containing a validated metadata manifest.
2642            The manifest should include a column entityId containing Synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2643            Some datasets, e.g. clinical data, do not contain file IDs, but data is stored in a table: one row per item.
2644            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this file.
2645            datasetId: synapse ID of folder containing the dataset
2646            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options 'file_and_entities' and 'table_and_file' in combination.
2647            hideBlanks: Default is False. Boolean flag; when true, annotation keys with blank values are not uploaded. When false, annotation keys with empty string values are uploaded.
2648            restrict_manifest (bool): Default is False. Flag for censored data.
2649            table_manipulation (str): Default is 'replace'. Specify how the manifest table should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2650            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2651                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2652                display label formatting.
2653            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2654                name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain
2655                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2656        Returns:
2657            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2658        """
2659        # Read new manifest CSV:
2660        manifest = self._read_manifest(metadataManifestPath)
2661        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2662
2663        table_name, component_name = self._generate_table_name(manifest)
2664
2665        # Upload manifest to synapse based on user input (manifest_record_type)
2666        if manifest_record_type == "file_only":
2667            manifest_synapse_file_id = self.upload_manifest_as_csv(
2668                dmge=dmge,
2669                manifest=manifest,
2670                metadataManifestPath=metadataManifestPath,
2671                datasetId=datasetId,
2672                restrict=restrict_manifest,
2673                hideBlanks=hideBlanks,
2674                manifest_record_type=manifest_record_type,
2675                component_name=component_name,
2676                annotation_keys=annotation_keys,
2677                file_annotations_upload=file_annotations_upload,
2678            )
2679        elif manifest_record_type == "table_and_file":
2680            manifest_synapse_file_id = self.upload_manifest_as_table(
2681                dmge=dmge,
2682                manifest=manifest,
2683                metadataManifestPath=metadataManifestPath,
2684                datasetId=datasetId,
2685                table_name=table_name,
2686                component_name=component_name,
2687                restrict=restrict_manifest,
2688                hideBlanks=hideBlanks,
2689                manifest_record_type=manifest_record_type,
2690                table_manipulation=table_manipulation,
2691                table_column_names=table_column_names,
2692                annotation_keys=annotation_keys,
2693                file_annotations_upload=file_annotations_upload,
2694            )
2695        elif manifest_record_type == "file_and_entities":
2696            manifest_synapse_file_id = self.upload_manifest_as_csv(
2697                dmge=dmge,
2698                manifest=manifest,
2699                metadataManifestPath=metadataManifestPath,
2700                datasetId=datasetId,
2701                restrict=restrict_manifest,
2702                hideBlanks=hideBlanks,
2703                manifest_record_type=manifest_record_type,
2704                component_name=component_name,
2705                annotation_keys=annotation_keys,
2706                file_annotations_upload=file_annotations_upload,
2707            )
2708        elif manifest_record_type == "table_file_and_entities":
2709            manifest_synapse_file_id = self.upload_manifest_combo(
2710                dmge=dmge,
2711                manifest=manifest,
2712                metadataManifestPath=metadataManifestPath,
2713                datasetId=datasetId,
2714                table_name=table_name,
2715                component_name=component_name,
2716                restrict=restrict_manifest,
2717                hideBlanks=hideBlanks,
2718                manifest_record_type=manifest_record_type,
2719                table_manipulation=table_manipulation,
2720                table_column_names=table_column_names,
2721                annotation_keys=annotation_keys,
2722                file_annotations_upload=file_annotations_upload,
2723            )
2724        else:
2725            raise ValueError(f"'{manifest_record_type}' is not a valid manifest_record_type; expected one of 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'.")
2726        return manifest_synapse_file_id
2727
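End to end, this dispatcher is typically the only method a caller needs. A hedged sketch (hypothetical IDs and paths; the dmge construction is omitted):

    from schematic.schemas.data_model_graph import DataModelGraphExplorer
    from schematic.store.synapse import SynapseStorage

    syn_store = SynapseStorage()
    dmge: DataModelGraphExplorer = ...  # built from your data model

    manifest_syn_id = syn_store.associateMetadataWithFiles(
        dmge=dmge,
        metadataManifestPath="synapse_storage_manifest.csv",
        datasetId="syn00000000",
        manifest_record_type="table_file_and_entities",
    )
    print(manifest_syn_id)  # SynID of the uploaded manifest csv
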
2728    def getTableAnnotations(self, table_id: str):
2729        """Generate dictionary of annotations for the given Synapse table.
2730        Synapse returns all custom annotations as lists since they
2731        can contain multiple values. In all cases, the values will
2732        be converted into strings and concatenated with ", ".
2733
2734        Args:
2735            table_id (str): Synapse ID for the table.
2736
2737        Returns:
2738            dict: Annotations as comma-separated strings.
2739        """
2740        try:
2741            entity = self.synapse_entity_tracker.get(
2742                synapse_id=table_id, syn=self.syn, download_file=False
2743            )
2744            is_table = entity.concreteType.endswith(".TableEntity")
2745            annotations_raw = entity.annotations
2746        except SynapseHTTPError:
2747            # If an error occurs with retrieving entity, skip it
2748            # This could be caused by a temporary file view that
2749            # was deleted since its ID was retrieved
2750            is_table = False
2751
2752        # Skip anything that isn't a table
2753        if not is_table:
2754            return None
2755
2756        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2757
2758        return annotations
2759
2760    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2761        """Generate dictionary of annotations for the given Synapse file.
2762        Synapse returns all custom annotations as lists since they
2763        can contain multiple values. In all cases, the values will
2764        be converted into strings and concatenated with ", ".
2765
2766        Args:
2767            fileId (str): Synapse ID for dataset file.
2768
2769        Returns:
2770            dict: Annotations as comma-separated strings.
2771        """
2772
2773        # Get entity metadata, including annotations
2774        try:
2775            entity = self.synapse_entity_tracker.get(
2776                synapse_id=fileId, syn=self.syn, download_file=False
2777            )
2778            is_file = entity.concreteType.endswith(".FileEntity")
2779            is_folder = entity.concreteType.endswith(".Folder")
2780            annotations_raw = entity.annotations
2781        except SynapseHTTPError:
2782            # If an error occurs with retrieving entity, skip it
2783            # This could be caused by a temporary file view that
2784            # was deleted since its ID was retrieved
2785            is_file, is_folder = False, False
2786
2787        # Skip anything that isn't a file or folder
2788        if not (is_file or is_folder):
2789            return None
2790
2791        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2792
2793        return annotations
2794
2795    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2796        # Extract annotations from their lists and stringify. For example:
2797        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2798        annotations = dict()
2799        for key, vals in annotations_raw.items():
2800            if isinstance(vals, list) and len(vals) == 1:
2801                annotations[key] = str(vals[0])
2802            else:
2803                annotations[key] = ", ".join(str(v) for v in vals)
2804
2805        # Add the file entity ID and eTag, which weren't lists
2806        assert fileId == entity.id, (
2807            "For some reason, the Synapse ID in the response doesn't match "
2808            "the Synapse ID sent in the request (via synapseclient)."
2809        )
2810        annotations["entityId"] = fileId
2811        annotations["eTag"] = entity.etag
2812
2813        return annotations
2814
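A standalone illustration of the stringification rule implemented above (plain Python; no Synapse access needed):

    annotations_raw = {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}
    annotations = {
        key: str(vals[0])
        if isinstance(vals, list) and len(vals) == 1
        else ", ".join(str(v) for v in vals)
        for key, vals in annotations_raw.items()
    }
    assert annotations == {"YearofBirth": "1980", "author": "bruno, milen, sujay"}
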
2815    def getDatasetAnnotations(
2816        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2817    ) -> pd.DataFrame:
2818        """Generate table for annotations across all files in given dataset.
2819
2820        Args:
2821            datasetId (str): Synapse ID for dataset folder.
2822            fill_na (bool): Whether to replace missing values with
2823                blank strings.
2824            force_batch (bool): Whether to force the function to use
2825                the batch mode, which uses a file view to retrieve
2826                annotations for a given dataset. Defaults to False;
2827                batch mode is used automatically when the dataset has 50 or more files.
2828
2829        Returns:
2830            pd.DataFrame: Table of annotations.
2831        """
2832        # Get all files in given dataset
2833        dataset_files = self.getFilesInStorageDataset(datasetId)
2834
2835        # if there are no dataset files, there are no annotations;
2836        # return an empty DataFrame
2837        if not dataset_files:
2838            return pd.DataFrame()
2839
2840        dataset_files_map = dict(dataset_files)
2841        dataset_file_ids, _ = list(zip(*dataset_files))
2842
2843        # Get annotations for each file from Step 1
2844        # Batch mode
2845        try_batch = len(dataset_files) >= 50 or force_batch
2846        if try_batch:
2847            try:
2848                logger.info("Trying batch mode for retrieving Synapse annotations")
2849                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2850            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2851                logger.info(
2852                    f"Unable to create a temporary file view bound to {datasetId} "
2853                    "(often a permissions error). Defaulting to slower "
2854                    "iterative retrieval of annotations."
2855                )
2856                # Fall back to the slower non-batch method
2857                try_batch = False
2858
2859        # Non-batch mode
2860        if not try_batch:
2861            logger.info("Using slower (non-batch) sequential mode")
2862            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2863            # Remove any annotations for non-file/folders (stored as None)
2864            records = filter(None, records)
2865            table = pd.DataFrame.from_records(records)
2866
2867        # Add filenames for the files that "survived" annotation retrieval
2868        filenames = [dataset_files_map[i] for i in table["entityId"]]
2869
2870        if "Filename" not in table.columns:
2871            table.insert(0, "Filename", filenames)
2872
2873        # Ensure that entityId and eTag are at the end
2874        entity_ids = table.pop("entityId")
2875        etags = table.pop("eTag")
2876        table.insert(len(table.columns), "entityId", entity_ids)
2877        table.insert(len(table.columns), "eTag", etags)
2878
2879        # Missing values are filled in with empty strings for Google Sheets
2880        if fill_na:
2881            table.fillna("", inplace=True)
2882
2883        # Force all values as strings
2884        return table.astype(str)
2885
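Typical use, assuming syn_store as initialized earlier (the dataset ID is a placeholder):

    table = syn_store.getDatasetAnnotations("syn00000000")
    # Columns: Filename, <annotation keys...>, entityId, eTag; all values are strings.
    print(table.head())
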
2886    def raise_final_error(retry_state):  # tenacity callback: re-raise the final attempt's error
2887        return retry_state.outcome.result()
2888
2889    def checkIfinAssetView(self, syn_id) -> bool:
2890        """Return True if the given Synapse ID appears in the administrative fileview for this pipeline."""
2891        assetViewTable = self.getStorageFileviewTable()
2892        all_files = list(assetViewTable["id"])
2893        if syn_id in all_files:
2894            return True
2895        else:
2896            return False
2897
2898    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2899    @retry(
2900        stop=stop_after_attempt(5),
2901        wait=wait_chain(
2902            *[wait_fixed(10) for i in range(2)]
2903            + [wait_fixed(15) for i in range(2)]
2904            + [wait_fixed(20)]
2905        ),
2906        retry=retry_if_exception_type(LookupError),
2907        retry_error_callback=raise_final_error,
2908    )
2909    def getDatasetProject(self, datasetId: str) -> str:
2910        """Get parent project for a given dataset ID.
2911
2912        Args:
2913            datasetId (str): Synapse entity ID (folder or project).
2914
2915        Raises:
2916            ValueError: Raised if Synapse ID cannot be retrieved
2917            by the user or if it doesn't appear in the file view.
2918
2919        Returns:
2920            str: The Synapse ID for the parent project.
2921        """
2922
2923        # Subset main file view
2924        dataset_index = self.storageFileviewTable["id"] == datasetId
2925        dataset_row = self.storageFileviewTable[dataset_index]
2926
2927        # re-query if no datasets found
2928        if dataset_row.empty:
2929            sleep(5)
2930            self.query_fileview(force_requery=True)
2931            # Subset main file view
2932            dataset_index = self.storageFileviewTable["id"] == datasetId
2933            dataset_row = self.storageFileviewTable[dataset_index]
2934
2935        # Return `projectId` for given row if only one found
2936        if len(dataset_row) == 1:
2937            dataset_project = dataset_row["projectId"].values[0]
2938            return dataset_project
2939
2940        # Otherwise, check if already project itself
2941        try:
2942            syn_object = self.synapse_entity_tracker.get(
2943                synapse_id=datasetId, syn=self.syn, download_file=False
2944            )
2945            if syn_object.properties["concreteType"].endswith("Project"):
2946                return datasetId
2947        except SynapseHTTPError:
2948            raise PermissionError(
2949                f"The given dataset ({datasetId}) isn't accessible with this "
2950                "user. This might be caused by a typo in the dataset Synapse ID."
2951            )
2952
2953        # If not, then assume dataset not in file view
2954        raise LookupError(
2955            f"The given dataset ({datasetId}) doesn't appear in the "
2956            f"configured file view ({self.storageFileview}). This might "
2957            "mean that the file view's scope needs to be updated."
2958        )
2959
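The tenacity decorator above retries up to five attempts on LookupError only, with waits of 10s, 10s, 15s, and 15s between attempts, then surfaces the final error via raise_final_error. A minimal, self-contained illustration of the same retry pattern (hypothetical flaky function; waits shortened for the demo):

    import random

    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_fixed(0.1),
        retry=retry_if_exception_type(LookupError),
    )
    def flaky_lookup() -> str:
        if random.random() < 0.5:
            raise LookupError("dataset not yet in file view")
        return "syn00000000"

    print(flaky_lookup())
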
2960    def getDatasetAnnotationsBatch(
2961        self, datasetId: str, dataset_file_ids: Optional[Sequence[str]] = None
2962    ) -> pd.DataFrame:
2963        """Generate table for annotations across all files in given dataset.
2964        This function uses a temporary file view to generate a table
2965        instead of iteratively querying for individual entity annotations.
2966        This function is expected to run much faster than
2967        iterating with `self.getFileAnnotations` on large datasets.
2968
2969        Args:
2970            datasetId (str): Synapse ID for dataset folder.
2971            dataset_file_ids (Sequence[str]): List of Synapse IDs
2972                for dataset files/folders used to subset the table.
2973
2974        Returns:
2975            pd.DataFrame: Table of annotations.
2976        """
2977        # Create data frame from annotations file view
2978        with DatasetFileView(datasetId, self.syn) as fileview:
2979            table = fileview.query()
2980
2981        if dataset_file_ids:
2982            table = table.loc[table.index.intersection(dataset_file_ids)]
2983
2984        table = table.reset_index(drop=True)
2985
2986        return table
2987
2988    def _get_table_schema_by_cname(self, table_schema):
2989        # assume no duplicate column names in the table
2990        table_schema_by_cname = {}
2991
2992        for col_record in table_schema:
2993            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
2994            table_schema_by_cname[col_record["name"]] = col_record
2995
2996        return table_schema_by_cname
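
A tiny runnable illustration of the keying above (hypothetical column records):

    table_schema = [
        {"name": "Filename", "columnType": "STRING", "maximumSize": 256},
        {"name": "entityId", "columnType": "ENTITYID"},
    ]
    by_cname = {record["name"]: record for record in table_schema}
    assert by_cname["entityId"]["columnType"] == "ENTITYID"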

Implementation of the Storage interface for datasets/files stored on Synapse. Provides utilities to list files in a specific project, update file annotations, create file views, etc.

TODO: Need to define the interface and rename and/or refactor some of the methods below.

@tracer.start_as_current_span('SynapseStorage::__init__')
SynapseStorage( token: Optional[str] = None, access_token: Optional[str] = None, project_scope: Optional[list] = None, synapse_cache_path: Optional[str] = None, perform_query: Optional[bool] = True, columns: Optional[list] = None, where_clauses: Optional[list] = None)
303    @tracer.start_as_current_span("SynapseStorage::__init__")
304    def __init__(
305        self,
306        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
307        access_token: Optional[str] = None,
308        project_scope: Optional[list] = None,
309        synapse_cache_path: Optional[str] = None,
310        perform_query: Optional[bool] = True,
311        columns: Optional[list] = None,
312        where_clauses: Optional[list] = None,
313    ) -> None:
314        """Initializes a SynapseStorage object.
315
316        Args:
317            token (Optional[str], optional):
318              Optional token parameter as found in browser cookie upon login to synapse.
319              Defaults to None.
320            access_token (Optional[str], optional):
321              Optional access token (personal or oauth).
322              Defaults to None.
323            project_scope (Optional[list], optional): Defaults to None.
324            synapse_cache_path (Optional[str], optional):
325              Location of synapse cache.
326              Defaults to None.
327        TODO:
328            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently, with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.
329        """
330        self.syn = self.login(synapse_cache_path, access_token)
331        self.project_scope = project_scope
332        self.storageFileview = CONFIG.synapse_master_fileview_id
333        self.manifest = CONFIG.synapse_manifest_basename
334        self.root_synapse_cache = self.syn.cache.cache_root_dir
335        self.synapse_entity_tracker = SynapseEntityTracker()
336        if perform_query:
337            self.query_fileview(columns=columns, where_clauses=where_clauses)

Initializes a SynapseStorage object.

Arguments:
  • token (Optional[str], optional): Optional token parameter as found in browser cookie upon login to synapse. Defaults to None.
  • access_token (Optional[str], optional): Optional access token (personal or oauth). Defaults to None.
  • project_scope (Optional[list], optional): Defaults to None.
  • synapse_cache_path (Optional[str], optional): Location of synapse cache. Defaults to None.
TODO:

Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently, with how query_fileview is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped queries expands.

syn
project_scope
storageFileview
manifest
root_synapse_cache
synapse_entity_tracker
@tracer.start_as_current_span('SynapseStorage::query_fileview')
def query_fileview( self, columns: Optional[list] = None, where_clauses: Optional[list] = None, force_requery: Optional[bool] = False) -> None:
376    @tracer.start_as_current_span("SynapseStorage::query_fileview")
377    def query_fileview(
378        self,
379        columns: Optional[list] = None,
380        where_clauses: Optional[list] = None,
381        force_requery: Optional[bool] = False,
382    ) -> None:
383        """
384        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
385        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
386        Args:
387            columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
388            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
389            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
390        """
391        self._purge_synapse_cache()
392
393        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
394        self.new_query_different = True
395
396        # If a query has already been performed, store the query
397        previous_query_built = hasattr(self, "fileview_query")
398        if previous_query_built:
399            previous_query = self.fileview_query
400
401        # Build a query with the current given parameters and check to see if it is different from the previous
402        self._build_query(columns=columns, where_clauses=where_clauses)
403        if previous_query_built:
404            self.new_query_different = self.fileview_query != previous_query
405
406        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
407        if self.new_query_different or force_requery:
408            try:
409                self.storageFileviewTable = self.syn.tableQuery(
410                    query=self.fileview_query,
411                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
412            except SynapseHTTPError as exc:
413                exception_text = str(exc)
414                if "Unknown column path" in exception_text:
415                    raise ValueError(
416                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by following the instructions in the validation rules documentation."
417                    )
418                elif "Unknown column" in exception_text:
419                    missing_column = exception_text.split("Unknown column ")[-1]
420                    raise ValueError(
421                        f"The column(s) {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
422                    )
423                else:
424                    raise AccessCredentialsError(self.storageFileview)

Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute. Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.

Arguments:
  • columns (Optional[list], optional): List of columns to be selected from the table. Default behavior is to request all columns.
  • where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
  • force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
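
A sketch of re-scoping the file view after initialization (hypothetical parent ID):

    syn_store = SynapseStorage(perform_query=False)  # skip the initial full query
    syn_store.query_fileview(
        columns=["id", "path"],
        where_clauses=["parentId='syn00000000'"],
    )
    scoped_df = syn_store.getStorageFileviewTable()
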
@staticmethod
def build_clause_from_dataset_id( dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None) -> str:
426    @staticmethod
427    def build_clause_from_dataset_id(
428        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
429    ) -> str:
430        """
431        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
432        Args:
433            dataset_id: Synapse ID of a dataset that should be used to limit the query
434            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
435        Returns:
436            clause for the query or an empty string if no dataset ID is provided
437        """
438        # Calling this method without specifying synIDs will complete but will not scope the view
439        if (not dataset_id) and (not dataset_folder_list):
440            return ""
441
442        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
443        if dataset_folder_list:
444            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
445            return f"parentId IN ({search_folders})"
446
447        # `dataset_id` should be provided when all files are stored directly under the dataset folder
448        return f"parentId='{dataset_id}'"

Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.

Arguments:
  • dataset_id: Synapse ID of a dataset that should be used to limit the query
  • dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:

clause for the query or an empty string if no dataset ID is provided
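
Both branches, exercised directly against the logic shown above:

    clause = SynapseStorage.build_clause_from_dataset_id(dataset_id="syn123")
    assert clause == "parentId='syn123'"

    clause = SynapseStorage.build_clause_from_dataset_id(
        dataset_folder_list=["syn123", "syn456"]
    )
    assert clause == "parentId IN ('syn123', 'syn456')"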

@staticmethod
@tracer.start_as_current_span('SynapseStorage::login')
def login( synapse_cache_path: Optional[str] = None, access_token: Optional[str] = None) -> synapseclient.client.Synapse:
488    @staticmethod
489    @tracer.start_as_current_span("SynapseStorage::login")
490    def login(
491        synapse_cache_path: Optional[str] = None,
492        access_token: Optional[str] = None,
493    ) -> synapseclient.Synapse:
494        """Login to Synapse
495
496        Args:
497            access_token (Optional[str], optional): A synapse access token. Defaults to None.
498            synapse_cache_path (Optional[str]): location of synapse cache
499
500        Raises:
501            ValueError: If unable to log in with access token
502
503        Returns:
504            synapseclient.Synapse: A Synapse object that is logged in
505        """
506        if not access_token:
507            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
508
509        # login using a token
510        if access_token:
511            try:
512                syn = synapseclient.Synapse(
513                    cache_root_dir=synapse_cache_path,
514                    debug=False,
515                    skip_checks=True,
516                    cache_client=False,
517                )
518                syn.login(authToken=access_token, silent=True)
519            except SynapseHTTPError as exc:
520                raise ValueError(
521                    "No access to resources. Please make sure that your token is correct"
522                ) from exc
523        else:
524            # login using synapse credentials provided by user in .synapseConfig (default) file
525            syn = synapseclient.Synapse(
526                configPath=CONFIG.synapse_configuration_path,
527                cache_root_dir=synapse_cache_path,
528                debug=False,
529                skip_checks=True,
530                cache_client=False,
531            )
532            syn.login(silent=True)
533
534        # set user id attribute
535        current_span = trace.get_current_span()
536        if current_span.is_recording():
537            current_span.set_attribute("user.id", syn.credentials.owner_id)
538
539        return syn

Login to Synapse

Arguments:
  • access_token (Optional[str], optional): A synapse access token. Defaults to None.
  • synapse_cache_path (Optional[str]): location of synapse cache
Raises:
  • ValueError: If unable to log in with access token
Returns:

synapseclient.Synapse: A Synapse object that is logged in
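
A sketch of the two login routes (the token value is a placeholder):

    # Route 1: explicit access token (or set the SYNAPSE_ACCESS_TOKEN env var)
    syn = SynapseStorage.login(access_token="<personal-access-token>")

    # Route 2: fall back to the .synapseConfig file referenced by schematic's CONFIG
    syn = SynapseStorage.login()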

def missing_entity_handler(method):
541    def missing_entity_handler(method):
542        def wrapper(*args, **kwargs):
543            try:
544                return method(*args, **kwargs)
545            except SynapseHTTPError as ex:
546                str_message = str(ex).replace("\n", "")
547                if "trash" in str_message or "does not exist" in str_message:
548                    logging.warning(str_message)
549                    return None
550                else:
551                    raise ex
552
553        return wrapper
def async_missing_entity_handler(method):
555    def async_missing_entity_handler(method):
556        """Decorator to handle missing entities in async methods."""
557
558        async def wrapper(*args: Any, **kwargs: Any) -> Any:
559            try:
560                return await method(*args, **kwargs)
561            except SynapseHTTPError as ex:
562                str_message = str(ex).replace("\n", "")
563                if "trash" in str_message or "does not exist" in str_message:
564                    logging.warning(str_message)
565                    return None
566                else:
567                    raise ex
568
569        return wrapper

Decorator to handle missing entities in async methods.
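
A sketch of wrapping a Synapse call with the synchronous handler (hypothetical function; the wrapper logs and returns None when the entity was trashed or no longer exists, instead of raising):

    from synapseclient import Synapse

    from schematic.store.synapse import SynapseStorage

    @SynapseStorage.missing_entity_handler
    def fetch_annotations(syn: Synapse, synapse_id: str):
        # synapseclient's get_annotations; the decorator absorbs "trash"/"does not
        # exist" SynapseHTTPErrors raised here and returns None in their place.
        return syn.get_annotations(synapse_id)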

def getStorageFileviewTable(self):
571    def getStorageFileviewTable(self):
572        """Returns the storageFileviewTable obtained during initialization."""
573        return self.storageFileviewTable

Returns the storageFileviewTable obtained during initialization.

def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
575    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
576        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
577
578        Args:
579            currentUserId: synapse id for the user whose projects we want to get.
580
581        Returns:
582            A dictionary with a next page token and the results.
583        """
584        all_results = self.syn.restGET(
585            "/projects/user/{principalId}".format(principalId=currentUserId)
586        )
587
588        while (
589            "nextPageToken" in all_results
590        ):  # iterate over next page token in results while there is any
591            results_token = self.syn.restGET(
592                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
593                    principalId=currentUserId,
594                    nextPageToken=all_results["nextPageToken"],
595                )
596            )
597            all_results["results"].extend(results_token["results"])
598
599            if "nextPageToken" in results_token:
600                all_results["nextPageToken"] = results_token["nextPageToken"]
601            else:
602                del all_results["nextPageToken"]
603
604        return all_results

Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.

Arguments:
  • currentUserId: synapse id for the user whose projects we want to get.
Returns:

A dictionary with a next page token and the results.

@tracer.start_as_current_span('SynapseStorage::getStorageProjects')
def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
606    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
607    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
608        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
609
610        Returns:
611            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
612        """
613
614        # get the set of all storage Synapse projects accessible to this pipeline
615        storageProjects = self.storageFileviewTable["projectId"].unique()
616
617        # get the set of storage Synapse projects accessible to this user
618        # get a list of projects from Synapse
619        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
620            current_user_id=self.syn.credentials.owner_id, syn=self.syn
621        )
622        project_id_to_name_dict = {}
623        current_user_projects = []
624        for project_header in current_user_project_headers:
625            project_id_to_name_dict[project_header.get("id")] = project_header.get(
626                "name"
627            )
628            current_user_projects.append(project_header.get("id"))
629
630        # find set of user projects that are also in this pipeline's storage projects set
631        storageProjects = list(set(storageProjects) & set(current_user_projects))
632
633        # Limit projects to scope if specified
634        if project_scope:
635            storageProjects = list(set(storageProjects) & set(project_scope))
636
637            if not storageProjects:
638                raise Warning(
639                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
640                )
641
642        # prepare a return list of project IDs and names
643        projects = []
644        for projectId in storageProjects:
645            project_name_from_project_header = project_id_to_name_dict.get(projectId)
646            projects.append((projectId, project_name_from_project_header))
647
648        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
649
650        return sorted_projects_list

Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.

Returns:

A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).

@tracer.start_as_current_span('SynapseStorage::getStorageDatasetsInProject')
def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
652    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
653    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
654        """Gets all dataset folders under a given storage project that the current user has access to.
655
656        Args:
657            projectId: synapse ID of a storage project.
658
659        Returns:
660            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
661            None: If the projectId cannot be found on Synapse.
662        """
663
664        # select all folders and fetch their names from within the storage project;
665        # if folder content type is defined, only select folders that contain datasets
666        if "contentType" in self.storageFileviewTable.columns:
667            foldersTable = self.storageFileviewTable[
668                (self.storageFileviewTable["contentType"] == "dataset")
669                & (self.storageFileviewTable["projectId"] == projectId)
670            ]
671        else:
672            foldersTable = self.storageFileviewTable[
673                (self.storageFileviewTable["type"] == "folder")
674                & (self.storageFileviewTable["parentId"] == projectId)
675            ]
676
677        # get an array of tuples (folderId, folderName)
678        # some folders are part of datasets; others contain datasets
679        # each dataset parent is the project; folders part of a dataset have another folder as a parent
680        # to get folders if and only if they contain datasets, for each folder
681        # check if the folder's parent is the project; if so, that folder contains a dataset,
682        # unless the folder list has already been filtered to dataset folders based on the contentType attribute above
683
684        datasetList = []
685        folderProperties = ["id", "name"]
686        for folder in list(
687            foldersTable[folderProperties].itertuples(index=False, name=None)
688        ):
689            datasetList.append(folder)
690
691        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
692
693        return sorted_dataset_list

Gets all dataset folders under a given storage project that the current user has access to.

Arguments:
  • projectId: synapse ID of a storage project.
Returns:

A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName). None: If the projectId cannot be found on Synapse.
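
Walking the hierarchy with the two methods above (assumes syn_store as initialized earlier):

    for project_id, project_name in syn_store.getStorageProjects():
        for dataset_id, dataset_name in syn_store.getStorageDatasetsInProject(project_id):
            print(f"{project_name} / {dataset_name}: {dataset_id}")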

@tracer.start_as_current_span('SynapseStorage::getFilesInStorageDataset')
def getFilesInStorageDataset( self, datasetId: str, fileNames: List = None, fullpath: bool = True) -> List[Tuple[str, str]]:
695    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
696    def getFilesInStorageDataset(
697        self, datasetId: str, fileNames: List = None, fullpath: bool = True
698    ) -> List[Tuple[str, str]]:
699        """Gets all files (excluding manifest files) in a given dataset folder.
700
701        Args:
702            datasetId: synapse ID of a storage dataset.
703            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
704            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
705            fullpath: if True return the full path as part of this filename; otherwise return just base filename
706
707        Returns:
708            A list of files; the list consists of tuples (fileId, fileName).
709
710        Raises:
711            ValueError: Dataset ID not found.
712        """
713        file_list = []
714
715        # Get path to dataset folder by using children to avoid cases where the dataset is the scope of the view
716        if self.storageFileviewTable.empty:
717            raise ValueError(
718                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
719            )
720        child_path = self.storageFileviewTable.loc[
721            self.storageFileviewTable["parentId"] == datasetId, "path"
722        ]
723        if child_path.empty:
724            raise LookupError(
725                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
726            )
727        child_path = child_path.iloc[0]
728
729        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
730        parent = child_path.split("/")[:-1]
731        parent = "/".join(parent)
732
733        # When querying, only include files to exclude entity files and subdirectories
734        where_clauses = [create_like_statement(parent), "type='file'"]
735
736        # Requery the fileview to specifically get the files in the given dataset
737        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
738
739        # Exclude manifest files
740        non_manifest_files = self.storageFileviewTable.loc[
741            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
742            :,
743        ]
744
745        # Remove all files that are not in the list of fileNames
746        if fileNames:
747            filename_regex = "|".join(fileNames)
748
749            matching_files = non_manifest_files["path"].str.contains(
750                filename_regex, case=False, regex=True
751            )
752
753            non_manifest_files = non_manifest_files.loc[matching_files, :]
754
755        # Truncate path if necessary
756        if not fullpath:
757            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
758
759        # Return list of files as expected by other methods
760        file_list = list(non_manifest_files.itertuples(index=False, name=None))
761
762        return file_list

Gets all files (excluding manifest files) in a given dataset folder.

Arguments:
  • datasetId: synapse ID of a storage dataset.
  • fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g. metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
  • fullpath: if True return the full path as part of this filename; otherwise return just base filename
Returns:

A list of files; the list consists of tuples (fileId, fileName).

Raises:
  • ValueError: Dataset ID not found.
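For illustration, a minimal usage sketch. The dataset ID `syn12345` is a placeholder, and `SynapseStorage()` is assumed to authenticate from your existing schematic configuration:

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

# All non-manifest files in the dataset, as (fileId, fileName) tuples.
files = store.getFilesInStorageDataset("syn12345", fullpath=False)
for file_id, file_name in files:
    print(file_id, file_name)

# Restrict to particular names; entries are joined into a case-insensitive regex.
bams = store.getFilesInStorageDataset("syn12345", fileNames=["sample_A.bam"])
```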
@tracer.start_as_current_span('SynapseStorage::getDatasetManifest')
def getDatasetManifest( self, datasetId: str, downloadFile: bool = False, newManifestName: str = '', use_temporary_folder: bool = True) -> Union[str, synapseclient.entity.File]:
789    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
790    def getDatasetManifest(
791        self,
792        datasetId: str,
793        downloadFile: bool = False,
794        newManifestName: str = "",
795        use_temporary_folder: bool = True,
796    ) -> Union[str, File]:
797        """Gets the manifest associated with a given dataset.
798
799        Args:
800            datasetId: synapse ID of a storage dataset.
801            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
802            newManifestName: new name of a manifest that gets downloaded
803            use_temporary_folder: boolean argument indicating if a temporary folder
804                should be used to store the manifest file. This is useful when running
805                this code as an API server where multiple requests could be made at the
806                same time. This is set to False when the code is being used from the
807                CLI. Defaults to True.
808
809        Returns:
810            manifest_syn_id (String): Synapse ID of existing manifest file.
811            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
812            "" (String): No pre-existing manifest in dataset.
813        """
814        manifest_data = ""
815
816        # get a list of files containing the manifest for this dataset (if any)
817        all_files = self.storageFileviewTable
818
819        # construct regex based on manifest basename in the config
820        manifest_re = re.compile(os.path.basename(self.manifest) + r".*\.[tc]sv")
821
822        # search manifest based on given manifest basename regex above
823        # and return a dataframe containing name and id of manifests in a given asset view
824        manifest = all_files[
825            (all_files["name"].str.contains(manifest_re, regex=True))
826            & (all_files["parentId"] == datasetId)
827        ]
828
829        manifest = manifest[["id", "name"]]
830
831        # if there is no pre-existing manifest in the specified dataset
832        if manifest.empty:
833            logger.warning(
834                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
835            )
836            return ""
837
838        # if there is an existing manifest
839        else:
840            manifest_syn_id = self._get_manifest_id(manifest)
841            if downloadFile:
842                md = ManifestDownload(
843                    self.syn,
844                    manifest_id=manifest_syn_id,
845                    synapse_entity_tracker=self.synapse_entity_tracker,
846                )
847                manifest_data = md.download_manifest(
848                    newManifestName=newManifestName,
849                    manifest_df=manifest,
850                    use_temporary_folder=use_temporary_folder,
851                )
852        # TODO: revisit how downstream code handles manifest_data. If the downstream code would break when manifest_data is an empty string,
853                # then we should catch the error here without returning an empty string.
854                if not manifest_data:
855                    logger.debug(
856                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
857                    )
858                return manifest_data
859            return manifest_syn_id

Gets the manifest associated with a given dataset.

Arguments:
  • datasetId: synapse ID of a storage dataset.
  • downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
  • newManifestName: new name of a manifest that gets downloaded
  • use_temporary_folder: boolean argument indicating if a temporary folder should be used to store the manifest file. This is useful when running this code as an API server where multiple requests could be made at the same time. This is set to False when the code is being used from the CLI. Defaults to True.
Returns:

  • manifest_syn_id (String): Synapse ID of existing manifest file.
  • manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
  • "" (String): No pre-existing manifest in dataset.
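A hedged sketch of the three possible return values (IDs are placeholders):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

manifest_id = store.getDatasetManifest("syn12345")
if not manifest_id:  # "" means no pre-existing manifest
    print("no manifest in this dataset")
else:
    # With downloadFile=True a synapseclient File entity is returned instead.
    manifest_file = store.getDatasetManifest(
        "syn12345", downloadFile=True, use_temporary_folder=True
    )
    print(manifest_file.path)
```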

def getDataTypeFromManifest(self, manifestId: str):
861    def getDataTypeFromManifest(self, manifestId: str):
862        """Fetch a manifest and return data types of all columns
863        Args:
864            manifestId: synapse ID of a manifest
865        """
866        # get manifest file path
867        manifest_entity = self.synapse_entity_tracker.get(
868            synapse_id=manifestId, syn=self.syn, download_file=True
869        )
870        manifest_filepath = manifest_entity.path
871
872        # load manifest dataframe
873        manifest = load_df(
874            manifest_filepath,
875            preserve_raw_input=False,
876            data_model=False,
877        )
878
879        # convert the dataFrame to use best possible dtypes.
880        manifest_new = manifest.convert_dtypes()
881
882        # get data types of columns
883        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
884
885        # return the result as a dictionary
886        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
887
888        return result_dict

Fetch a manifest and return data types of all columns

Arguments:
  • manifestId: synapse ID of a manifest
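The method returns a plain dict mapping each column name to a pandas dtype string; a sketch (the manifest ID and the output shown are illustrative only):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

dtypes = store.getDataTypeFromManifest("syn67890")
for column, dtype in dtypes.items():
    print(column, dtype)  # e.g. "Filename string", "entityId string"
```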
def add_entity_id_and_filename( self, datasetId: str, manifest: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
912    def add_entity_id_and_filename(
913        self, datasetId: str, manifest: pd.DataFrame
914    ) -> pd.DataFrame:
915        """add entityid and filename column to an existing manifest assuming entityId column is not already present
916
917        Args:
918            datasetId (str): dataset syn id
919            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
920
921        Returns:
922            pd.DataFrame: returns a pandas dataframe
923        """
924        # get file names and entity ids of a given dataset
925        dataset_files_dict = self._get_files_metadata_from_dataset(
926            datasetId, only_new_files=False
927        )
928
929        if dataset_files_dict:
930            # turn manifest dataframe back to a dictionary for operation
931            manifest_dict = manifest.to_dict("list")
932
933            # update Filename column
934            # add entityId column to the end
935            manifest_dict.update(dataset_files_dict)
936
937            # if the component column exists in existing manifest, fill up that column
938            if "Component" in manifest_dict.keys():
939                manifest_dict["Component"] = manifest_dict["Component"] * max(
940                    1, len(manifest_dict["Filename"])
941                )
942
943            # turn dictionary back to a dataframe
944            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
945            manifest_df_updated = manifest_df_index.transpose()
946
947            # fill na with empty string
948            manifest_df_updated = manifest_df_updated.fillna("")
949
950            # drop index
951            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
952
953            return manifest_df_updated
954        else:
955            return manifest

Add entityId and Filename columns to an existing manifest, assuming an entityId column is not already present

Arguments:
  • datasetId (str): dataset syn id
  • manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
Returns:

pd.DataFrame: returns a pandas dataframe
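A sketch of the expected input shape, assuming a manifest whose Filename column exists but is empty (IDs hypothetical):

```python
import pandas as pd

from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

manifest = pd.DataFrame({"Filename": [""], "Component": ["Biospecimen"]})
updated = store.add_entity_id_and_filename("syn12345", manifest)
# Filename is populated from the dataset's files and entityId is appended
# as the final column; missing values are filled with empty strings.
```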

def fill_in_entity_id_filename( self, datasetId: str, manifest: pandas.core.frame.DataFrame) -> Tuple[List, pandas.core.frame.DataFrame]:
 957    def fill_in_entity_id_filename(
 958        self, datasetId: str, manifest: pd.DataFrame
 959    ) -> Tuple[List, pd.DataFrame]:
 960        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 961
 962        Args:
 963            datasetId (str): dataset syn id
 964            manifest (pd.DataFrame): existing manifest dataframe.
 965
 966        Returns:
 967            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 968        """
 969        # get dataset file names and entity id as a list of tuple
 970        dataset_files = self.getFilesInStorageDataset(datasetId)
 971
 972        # update manifest with additional filenames, if any
 973        # note that if there is an existing manifest and there are files in the dataset
 974        # the columns Filename and entityId are assumed to be present in manifest schema
 975        # TODO: use idiomatic panda syntax
 976        if not dataset_files:
 977            manifest = manifest.fillna("")
 978            return dataset_files, manifest
 979
 980        all_files = self._get_file_entityIds(
 981            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 982        )
 983        new_files = self._get_file_entityIds(
 984            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 985        )
 986
 987        all_files = pd.DataFrame(all_files)
 988        new_files = pd.DataFrame(new_files)
 989
 990        # update manifest so that it contains new dataset files
 991        manifest = (
 992            pd.concat([manifest, new_files], sort=False)
 993            .reset_index()
 994            .drop("index", axis=1)
 995        )
 996
 997        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 998        manifest_reindex = manifest.set_index("entityId")
 999        all_files_reindex = all_files.set_index("entityId")
1000        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1001            manifest_reindex
1002        )
1003
1004        # Check if individual file paths in manifest and from synapse match
1005        file_paths_match = (
1006            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1007        )
1008
1009        # If all the paths do not match, update the manifest with the filepaths from synapse
1010        if not file_paths_match.all():
1011            manifest_reindex.loc[
1012                ~file_paths_match, "Filename"
1013            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1014
1015            # reformat manifest for further use
1016            manifest = manifest_reindex.reset_index()
1017            entityIdCol = manifest.pop("entityId")
1018            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1019
1020        manifest = manifest.fillna("")
1021        return dataset_files, manifest

Fill in the Filename and entityId columns. Both columns will be created if not already present.

Arguments:
  • datasetId (str): dataset syn id
  • manifest (pd.DataFrame): existing manifest dataframe.
Returns:

Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
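A sketch, assuming an existing manifest DataFrame (IDs hypothetical):

```python
import pandas as pd

from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

manifest = pd.DataFrame({"Filename": [""], "entityId": [""]})
dataset_files, updated_manifest = store.fill_in_entity_id_filename(
    "syn12345", manifest
)
# dataset_files: (fileId, fileName) tuples under the dataset folder
# updated_manifest: Filename and entityId aligned with what is on Synapse
```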

@tracer.start_as_current_span('SynapseStorage::updateDatasetManifestFiles')
def updateDatasetManifestFiles( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, datasetId: str, store: bool = True) -> Optional[Tuple[str, pandas.core.frame.DataFrame]]:
1023    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1024    def updateDatasetManifestFiles(
1025        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1026    ) -> Union[Tuple[str, pd.DataFrame], None]:
1027        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1028
1029        Args:
1030            dmge: DataModelGraphExplorer Instance
1031            datasetId: synapse ID of a storage dataset.
1032            store: if set to True store updated manifest in asset store; if set to False
1033            return a Pandas dataframe containing updated manifest but do not store to asset store
1034
1035
1036        Returns:
1037            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1038            If there is no existing manifest or if the manifest does not have an entityId column, return None
1039        """
1040
1041        # get existing manifest Synapse ID
1042        manifest_id = self.getDatasetManifest(datasetId)
1043
1044        # if there is no manifest return None
1045        if not manifest_id:
1046            return None
1047
1048        manifest_entity = self.synapse_entity_tracker.get(
1049            synapse_id=manifest_id, syn=self.syn, download_file=True
1050        )
1051        manifest_filepath = manifest_entity.path
1052        manifest = load_df(manifest_filepath)
1053
1054        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1055        if "entityId" not in manifest.columns:
1056            return None
1057
1058        manifest_is_file_based = "Filename" in manifest.columns
1059
1060        if manifest_is_file_based:
1061            # update manifest with additional filenames, if any
1062            # note that if there is an existing manifest and there are files in the dataset
1063            # the columns Filename and entityId are assumed to be present in manifest schema
1064            # TODO: use idiomatic panda syntax
1065            dataset_files, manifest = self.fill_in_entity_id_filename(
1066                datasetId, manifest
1067            )
1068            if dataset_files:
1069                # update the manifest file, so that it contains the relevant entity IDs
1070                if store:
1071                    manifest.to_csv(manifest_filepath, index=False)
1072
1073                    # store manifest and update associated metadata with manifest on Synapse
1074                    manifest_id = self.associateMetadataWithFiles(
1075                        dmge, manifest_filepath, datasetId
1076                    )
1077
1078        return manifest_id, manifest

Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.

Arguments:
  • dmge: DataModelGraphExplorer Instance
  • datasetId: synapse ID of a storage dataset.
  • store: if set to True store updated manifest in asset store; if set to False return a Pandas dataframe containing updated manifest but do not store to asset store
Returns:

Synapse ID of updated manifest and Pandas dataframe containing the updated manifest. If there is no existing manifest or if the manifest does not have an entityId column, return None
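A sketch that builds the required DataModelGraphExplorer the same way the module does elsewhere (file names and IDs are placeholders; the DataModelParser import path is assumed):

```python
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser  # path assumed
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

parsed = DataModelParser(path_to_data_model="model.jsonld").parse_model()
dmge = DataModelGraphExplorer(DataModelGraph(parsed).generate_data_model_graph())

# store=False: return the updated manifest without writing back to Synapse.
result = store.updateDatasetManifestFiles(dmge, "syn12345", store=False)
if result is not None:
    manifest_id, manifest_df = result
```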

@tracer.start_as_current_span('SynapseStorage::getProjectManifests')
def getProjectManifests( self, projectId: str) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1124    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1125    def getProjectManifests(
1126        self, projectId: str
1127    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1128        """Gets all metadata manifest files across all datasets in a specified project.
1129
1130        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1131                 as a list of tuples, one for each manifest:
1132                    [
1133                        (
1134                            (datasetId, dataName),
1135                            (manifestId, manifestName),
1136                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1137                        ),
1138                        ...
1139                    ]
1140
1141        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1142        """
1143        component = None
1144        entity = None
1145        manifests = []
1146
1147        datasets = self.getStorageDatasetsInProject(projectId)
1148
1149        for datasetId, datasetName in datasets:
1150            # encode information about the manifest in a simple list (so that R clients can unpack it)
1151            # eventually can serialize differently
1152
1153            # Get synID of manifest for a dataset
1154            manifestId = self.getDatasetManifest(datasetId)
1155
1156            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1157            if manifestId:
1158                annotations = self.getFileAnnotations(manifestId)
1159
1160                # If manifest has annotations specifying component, use that
1161                if annotations and "Component" in annotations:
1162                    component = annotations["Component"]
1163                    entity = self.synapse_entity_tracker.get(
1164                        synapse_id=manifestId, syn=self.syn, download_file=False
1165                    )
1166                    manifest_name = entity["properties"]["name"]
1167
1168                # otherwise download the manifest and parse for information
1169                elif not annotations or "Component" not in annotations:
1170                    logging.debug(
1171                        f"No component annotations have been found for manifest {manifestId}. "
1172                        "The manifest will be downloaded and parsed instead. "
1173                        "For increased speed, add component annotations to manifest."
1174                    )
1175
1176                    manifest_info = self.getDatasetManifest(
1177                        datasetId, downloadFile=True
1178                    )
1179                    manifest_name = manifest_info["properties"].get("name", "")
1180
1181                    if not manifest_name:
1182                        logger.error(f"Failed to download manifests from {datasetId}")
1183
1184                    manifest_path = manifest_info["path"]
1185
1186                    manifest_df = load_df(manifest_path)
1187
1188                    # Get component from component column if it exists
1189                    if (
1190                        "Component" in manifest_df
1191                        and not manifest_df["Component"].empty
1192                    ):
1193                        # Collect the unique Component values from the manifest
1194                        component = list(set(manifest_df["Component"]))
1195
1196                        # Added to address issues raised during DCA testing
1197                        if "" in component:
1198                            component.remove("")
1199
1200                        if len(component) == 1:
1201                            component = component[0]
1202                        elif len(component) > 1:
1203                            logging.warning(
1204                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support multi-component manifests at this time. "
1205                                "Behavior of manifests with multiple components is undefined."
1206                            )
1207            else:
1208                manifest_name = ""
1209                component = None
1210            if component:
1211                manifest = (
1212                    (datasetId, datasetName),
1213                    (manifestId, manifest_name),
1214                    (component, component),
1215                )
1216            elif manifestId:
1217                logging.debug(
1218                    f"Manifest {manifestId} does not have an associated Component"
1219                )
1220                manifest = (
1221                    (datasetId, datasetName),
1222                    (manifestId, manifest_name),
1223                    ("", ""),
1224                )
1225            else:
1226                manifest = (
1227                    (datasetId, datasetName),
1228                    ("", ""),
1229                    ("", ""),
1230                )
1231
1232            if manifest:
1233                manifests.append(manifest)
1234
1235        return manifests

Gets all metadata manifest files across all datasets in a specified project.

Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest, as a list of tuples, one for each manifest:

  [
    (
      (datasetId, dataName),
      (manifestId, manifestName),
      (componentSchemaLabel, componentSchemaLabel)  # TODO: get component name from schema
    ),
    ...
  ]

TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
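Unpacking the nested tuples, per the structure documented above (project ID hypothetical):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

for dataset, manifest, component in store.getProjectManifests("syn11111"):
    dataset_id, dataset_name = dataset
    manifest_id, manifest_name = manifest
    component_label, _ = component  # "" when no Component is associated
    print(dataset_id, manifest_id or "<no manifest>", component_label)
```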

def upload_project_manifests_to_synapse( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, projectId: str) -> List[str]:
1237    def upload_project_manifests_to_synapse(
1238        self, dmge: DataModelGraphExplorer, projectId: str
1239    ) -> List[str]:
1240        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1241
1242        Returns: List of the dataset names whose manifests have been loaded.
1243        """
1244
1245        manifests = []
1246        manifest_loaded = []
1247        datasets = self.getStorageDatasetsInProject(projectId)
1248
1249        for datasetId, datasetName in datasets:
1250            # encode information about the manifest in a simple list (so that R clients can unpack it)
1251            # eventually can serialize differently
1252
1253            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1254
1255            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1256            if manifest_info:
1257                manifest_id = manifest_info["properties"]["id"]
1258                manifest_name = manifest_info["properties"]["name"]
1259                manifest_path = manifest_info["path"]
1260                manifest_df = load_df(manifest_path)
1261                manifest_table_id = self.uploadDB(
1262                    dmge=dmge,
1263                    manifest=manifest_df,
1264                    datasetId=datasetId,
1265                    table_name=datasetName,
1266                )
1267                manifest_loaded.append(datasetName)
1268        return manifest_loaded

Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.

Returns: List of the dataset names whose manifests have been loaded.

def upload_annotated_project_manifests_to_synapse( self, projectId: str, path_to_json_ld: str, dry_run: bool = False) -> List[str]:
1270    def upload_annotated_project_manifests_to_synapse(
1271        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1272    ) -> List[str]:
1273        """
1274        Purpose:
1275            For all manifests in a project, upload them as a table and add annotations to the manifest csv.
1276            Assumes the manifest is already present as a CSV in a dataset in the project.
1277
1278        """
1279        # Instantiate DataModelParser
1280        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1281        # Parse Model
1282        parsed_data_model = data_model_parser.parse_model()
1283
1284        # Instantiate DataModelGraph
1285        data_model_grapher = DataModelGraph(parsed_data_model)
1286
1287        # Generate graph
1288        graph_data_model = data_model_grapher.generate_data_model_graph()
1289
1290        # Instantiate DataModelGraphExplorer
1291        dmge = DataModelGraphExplorer(graph_data_model)
1292
1293        manifests = []
1294        manifest_loaded = []
1295        datasets = self.getStorageDatasetsInProject(projectId)
1296        for datasetId, datasetName in datasets:
1297            # encode information about the manifest in a simple list (so that R clients can unpack it)
1298            # eventually can serialize differently
1299
1300            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1301            manifests.append(manifest)
1302
1303            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1304
1305            if manifest_info:
1306                manifest_id = manifest_info["properties"]["id"]
1307                manifest_name = manifest_info["properties"]["name"]
1308                manifest_path = manifest_info["path"]
1309                manifest = (
1310                    (datasetId, datasetName),
1311                    (manifest_id, manifest_name),
1312                    ("", ""),
1313                )
1314                if not dry_run:
1315                    self.associateMetadataWithFiles(
1316                        dmge, manifest_path, datasetId, manifest_record_type="table"
1317                    )
1318                manifest_loaded.append(manifest)
1319
1320        return manifests, manifest_loaded
Purpose:

For all manifests in a project, upload them as a table and add annotations to the manifest csv. Assumes the manifest is already present as a CSV in a dataset in the project.

def move_entities_to_new_project( self, projectId: str, newProjectId: str, returnEntities: bool = False, dry_run: bool = False):
1322    def move_entities_to_new_project(
1323        self,
1324        projectId: str,
1325        newProjectId: str,
1326        returnEntities: bool = False,
1327        dry_run: bool = False,
1328    ):
1329        """
1330        For each manifest csv in a project, look for all the entity ids that are associated.
1331        Look up the entity in the files, and move the entity to the new project.
1332        """
1333
1334        manifests = []
1335        manifest_loaded = []
1336        datasets = self.getStorageDatasetsInProject(projectId)
1337        if datasets:
1338            for datasetId, datasetName in datasets:
1339                # encode information about the manifest in a simple list (so that R clients can unpack it)
1340                # eventually can serialize differently
1341
1342                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1343                manifests.append(manifest)
1344
1345                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1346                if manifest_info:
1347                    manifest_id = manifest_info["properties"]["id"]
1348                    manifest_name = manifest_info["properties"]["name"]
1349                    manifest_path = manifest_info["path"]
1350                    manifest_df = load_df(manifest_path)
1351
1352                    manifest = (
1353                        (datasetId, datasetName),
1354                        (manifest_id, manifest_name),
1355                        ("", ""),
1356                    )
1357                    manifest_loaded.append(manifest)
1358
1359                    annotation_entities = self.storageFileviewTable[
1360                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1361                        & (self.storageFileviewTable["type"] == "folder")
1362                    ]["id"]
1363
1364                    if returnEntities:
1365                        for entityId in annotation_entities:
1366                            if not dry_run:
1367                                moved_entity = self.syn.move(entityId, datasetId)
1368                                self.synapse_entity_tracker.add(
1369                                    synapse_id=moved_entity.id, entity=moved_entity
1370                                )
1371                            else:
1372                                logging.info(
1373                                    f"{entityId} will be moved to folder {datasetId}."
1374                                )
1375                    else:
1376                        # generate project folder
1377                        archive_project_folder = Folder(
1378                            projectId + "_archive", parent=newProjectId
1379                        )
1380                        archive_project_folder = self.syn.store(archive_project_folder)
1381                        self.synapse_entity_tracker.add(
1382                            synapse_id=archive_project_folder.id,
1383                            entity=archive_project_folder,
1384                        )
1385
1386                        # generate dataset folder
1387                        dataset_archive_folder = Folder(
1388                            "_".join([datasetId, datasetName, "archive"]),
1389                            parent=archive_project_folder.id,
1390                        )
1391                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1392                        self.synapse_entity_tracker.add(
1393                            synapse_id=dataset_archive_folder.id,
1394                            entity=dataset_archive_folder,
1395                        )
1396
1397                        for entityId in annotation_entities:
1398                            # move entities to folder
1399                            if not dry_run:
1400                                moved_entity = self.syn.move(
1401                                    entityId, dataset_archive_folder.id
1402                                )
1403                                self.synapse_entity_tracker.add(
1404                                    synapse_id=moved_entity.id, entity=moved_entity
1405                                )
1406                            else:
1407                                logging.info(
1408                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1409                                )
1410        else:
1411            raise LookupError(
1412                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1413            )
1414        return manifests, manifest_loaded

For each manifest csv in a project, look for all the entity ids that are associated. Look up the entity in the files, and move the entity to the new project.
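Because this method mutates projects, a dry run is the safest first step; a sketch (IDs hypothetical):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

# dry_run=True only logs the moves that would happen; nothing is changed.
manifests, manifest_loaded = store.move_entities_to_new_project(
    projectId="syn11111",
    newProjectId="syn99999",
    returnEntities=False,  # False archives entities under newProjectId
    dry_run=True,
)
```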

@tracer.start_as_current_span('SynapseStorage::get_synapse_table')
def get_synapse_table( self, synapse_id: str) -> Tuple[pandas.core.frame.DataFrame, synapseclient.table.CsvFileTable]:
1416    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1417    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1418        """Download synapse table as a pd dataframe; return table schema and etags as results too
1419
1420        Args:
1421            synapse_id: synapse ID of the table to query
1422        """
1423
1424        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1425        df = results.asDataFrame(
1426            rowIdAndVersionInIndex=False,
1427            na_values=STR_NA_VALUES_FILTERED,
1428            keep_default_na=False,
1429        )
1430
1431        return df, results

Download a Synapse table as a pandas dataframe; also return the query results object, which carries the table schema and etags.

Arguments:
  • synapse_id: synapse ID of the table to query
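A sketch (table ID hypothetical):

```python
from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

df, results = store.get_synapse_table("syn22222")
print(df.head())
# `results` is the CsvFileTable from the query; it retains the table
# schema and etags needed if you later write changes back.
```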
def uploadDB(*args, **kwargs):

Method to upload a database to an asset store. In synapse, this will upload a metadata table

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest: pd.Df manifest to upload
  • datasetId: synID of the dataset for the manifest
  • table_name: name of the table to be uploaded
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
  • existingTableId: str of the synId of the existing table, if one already exists
  • table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:

  • manifest_table_id: synID of the uploaded table
  • manifest: the original manifest
  • table_manifest: manifest formatted appropriately for the table

@tracer.start_as_current_span('SynapseStorage::formatDB')
def formatDB(self, dmge, manifest, table_column_names):
1482    @tracer.start_as_current_span("SynapseStorage::formatDB")
1483    def formatDB(self, dmge, manifest, table_column_names):
1484        """
1485        Method to format a manifest appropriately for upload as a table
1486
1487        Args:
1488            dmge: DataModelGraphExplorer object
1489            manifest: pd.Df manifest to upload
1490            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1491                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1492                display label formatting.
1493        Returns:
1494            col_schema: schema for table columns: type, size, etc
1495            table_manifest: formatted manifest
1496
1497        """
1498        # Rename the manifest columns to display names to match fileview
1499
1500        blacklist_chars = ["(", ")", ".", " ", "-"]
1501        manifest_columns = manifest.columns.tolist()
1502
1503        table_manifest = deepcopy(manifest)
1504
1505        if table_column_names == "display_name":
1506            cols = table_manifest.columns
1507
1508        elif table_column_names == "display_label":
1509            cols = [
1510                str(col).translate({ord(x): "" for x in blacklist_chars})
1511                for col in manifest_columns
1512            ]
1513
1514        elif table_column_names == "class_label":
1515            cols = [
1516                get_class_label_from_display_name(str(col)).translate(
1517                    {ord(x): "" for x in blacklist_chars}
1518                )
1519                for col in manifest_columns
1520            ]
1521        else:
1522            raise ValueError(
1523                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1524            )
1525
1526        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1527
1528        # Reset column names in table manifest
1529        table_manifest.columns = cols
1530
1531        # move entity id to end of df
1532        entity_col = table_manifest.pop("entityId")
1533        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1534
1535        # Get the column schema
1536        col_schema = as_table_columns(table_manifest)
1537
1538        # Set Id column length to 64 (for some reason not being auto set.)
1539        for i, col in enumerate(col_schema):
1540            if col["name"].lower() == "id":
1541                col_schema[i]["maximumSize"] = 64
1542
1543        return col_schema, table_manifest

Method to format a manifest appropriately for upload as a table

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest: pd.Df manifest to upload
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
Returns:

  • col_schema: schema for table columns: type, size, etc.
  • table_manifest: formatted manifest
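A sketch, assuming `store`, `dmge`, and a manifest DataFrame `manifest_df` (with an entityId column) from the examples above:

```python
# Assumes `store`, `dmge`, and `manifest_df` from the examples above.
col_schema, table_manifest = store.formatDB(
    dmge=dmge, manifest=manifest_df, table_column_names="class_label"
)
# col_schema: Synapse column definitions inferred from the dataframe,
#             with any "Id" column forced to maximumSize 64
# table_manifest: columns relabeled per the chosen style, entityId moved last
```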

@tracer.start_as_current_span('SynapseStorage::buildDB')
def buildDB( self, datasetId: str, table_name: str, col_schema: List, table_manifest: pandas.core.frame.DataFrame, table_manipulation: str, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, restrict: bool = False):
1545    @tracer.start_as_current_span("SynapseStorage::buildDB")
1546    def buildDB(
1547        self,
1548        datasetId: str,
1549        table_name: str,
1550        col_schema: List,
1551        table_manifest: pd.DataFrame,
1552        table_manipulation: str,
1553        dmge: DataModelGraphExplorer,
1554        restrict: bool = False,
1555    ):
1556        """
1557        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1558        Calls TableOperations class to execute
1559
1560        Args:
1561            datasetId: synID of the dataset for the manifest
1562            table_name: name of the table to be uploaded
1563            col_schema: schema for table columns: type, size, etc from `formatDB`
1564            table_manifest: formatted manifest that can be uploaded as a table
1565            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1566            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1567
1568        Returns:
1569            manifest_table_id: synID of the uploaded table
1570
1571        """
1572        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1573        existing_table_id = self.syn.findEntityId(
1574            name=table_name, parent=table_parent_id
1575        )
1576        tableOps = TableOperations(
1577            synStore=self,
1578            tableToLoad=table_manifest,
1579            tableName=table_name,
1580            datasetId=datasetId,
1581            existingTableId=existing_table_id,
1582            restrict=restrict,
1583            synapse_entity_tracker=self.synapse_entity_tracker,
1584        )
1585
1586        if not table_manipulation or existing_table_id is None:
1587            manifest_table_id = tableOps.createTable(
1588                columnTypeDict=col_schema,
1589                specifySchema=True,
1590            )
1591        elif existing_table_id is not None:
1592            if table_manipulation.lower() == "replace":
1593                manifest_table_id = tableOps.replaceTable(
1594                    specifySchema=True,
1595                    columnTypeDict=col_schema,
1596                )
1597            elif table_manipulation.lower() == "upsert":
1598                manifest_table_id = tableOps.upsertTable(
1599                    dmge=dmge,
1600                )
1601            elif table_manipulation.lower() == "update":
1602                manifest_table_id = tableOps.updateTable()
1603
1604        if table_manipulation and table_manipulation.lower() == "upsert":
1605            table_entity = self.synapse_entity_tracker.get(
1606                synapse_id=existing_table_id or manifest_table_id,
1607                syn=self.syn,
1608                download_file=False,
1609            )
1610            annos = OldAnnotations(
1611                id=table_entity.id,
1612                etag=table_entity.etag,
1613                values=table_entity.annotations,
1614            )
1615            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1616            annos = self.syn.set_annotations(annos)
1617            table_entity.etag = annos.etag
1618            table_entity.annotations = annos
1619
1620        return manifest_table_id

Method to construct the table appropriately: create new table, replace existing, or upsert new into existing. Calls the TableOperations class to execute.

Arguments:
  • datasetId: synID of the dataset for the manifest
  • table_name: name of the table to be uploaded
  • col_schema: schema for table columns: type, size, etc from formatDB
  • table_manifest: formatted manifest that can be uploaded as a table
  • table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
Returns:

manifest_table_id: synID of the uploaded table
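Continuing the sketch from formatDB, the `col_schema`/`table_manifest` pair feeds straight into buildDB (the dataset ID and table name are placeholders):

```python
# Assumes `store`, `dmge`, `col_schema`, and `table_manifest` from the examples above.
manifest_table_id = store.buildDB(
    datasetId="syn12345",
    table_name="synapse_storage_manifest_table",
    col_schema=col_schema,
    table_manifest=table_manifest,
    table_manipulation="upsert",  # "replace", "upsert", or "update"; "" creates new
    dmge=dmge,
    restrict=False,
)
```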

@tracer.start_as_current_span('SynapseStorage::upload_manifest_file')
def upload_manifest_file( self, manifest, metadataManifestPath, datasetId, restrict_manifest, component_name=''):
1622    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1623    def upload_manifest_file(
1624        self,
1625        manifest,
1626        metadataManifestPath,
1627        datasetId,
1628        restrict_manifest,
1629        component_name="",
1630    ):
1631        # Update manifest to have the new entityId column
1632        manifest.to_csv(metadataManifestPath, index=False)
1633
1634        # store manifest to Synapse as a CSV
1635        # update file name
1636        file_name_full = metadataManifestPath.split("/")[-1]
1637        file_extension = file_name_full.split(".")[-1]
1638
1639        # Differentiate "censored" and "uncensored" manifest
1640        if "censored" in file_name_full:
1641            file_name_new = (
1642                os.path.basename(CONFIG.synapse_manifest_basename)
1643                + "_"
1644                + component_name
1645                + "_censored"
1646                + "."
1647                + file_extension
1648            )
1649        else:
1650            file_name_new = (
1651                os.path.basename(CONFIG.synapse_manifest_basename)
1652                + "_"
1653                + component_name
1654                + "."
1655                + file_extension
1656            )
1657
1658        manifest_synapse_file = None
1659        try:
1660            # Rename the file to file_name_new then revert
1661            # This is to maintain the original file name in-case other code is
1662            # expecting that the file exists with the original name
1663            original_file_path = metadataManifestPath
1664            new_file_path = os.path.join(
1665                os.path.dirname(metadataManifestPath), file_name_new
1666            )
1667            os.rename(original_file_path, new_file_path)
1668
1669            manifest_synapse_file = self._store_file_for_manifest_upload(
1670                new_file_path=new_file_path,
1671                dataset_id=datasetId,
1672                existing_file_name=file_name_full,
1673                file_name_new=file_name_new,
1674                restrict_manifest=restrict_manifest,
1675            )
1676            manifest_synapse_file_id = manifest_synapse_file.id
1677
1678        finally:
1679            # Revert the file name back to the original
1680            os.rename(new_file_path, original_file_path)
1681
1682            if manifest_synapse_file:
1683                manifest_synapse_file.path = original_file_path
1684
1685        return manifest_synapse_file_id
async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1742    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1743        """get annotations asynchronously
1744
1745        Args:
1746            synapse_id (str): synapse id of the entity that the annotation belongs
1747
1748        Returns:
1749            Dict[str, Any]: The requested entity bundle matching
1750            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1751        """
1752        return await get_entity_id_bundle2(
1753            entity_id=synapse_id,
1754            request={"includeAnnotations": True},
1755            synapse_client=self.syn,
1756        )

get annotations asynchronously

Arguments:
  • synapse_id (str): synapse id of the entity that the annotation belongs
Returns:

Dict[str, Any]: The requested entity bundle matching https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html
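A sketch of driving the coroutine from synchronous code (the entity ID is hypothetical; the bundle layout follows the EntityBundle model linked above):

```python
import asyncio

from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

bundle = asyncio.run(store.get_async_annotation("syn33333"))
# Annotations sit under bundle["annotations"]; this mirrors how
# process_row_annotations below indexes the same structure.
print(bundle["annotations"]["annotations"])
```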

async def store_async_annotation( self, annotation_dict: dict) -> synapseclient.models.annotations.Annotations:
1758    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1759        """store annotation in an async way
1760
1761        Args:
1762            annotation_dict (dict): annotation in a dictionary format
1763
1764        Returns:
1765            Annotations: The stored annotations.
1766        """
1767        annotation_data = Annotations.from_dict(
1768            synapse_annotations=annotation_dict["annotations"]["annotations"]
1769        )
1770        annotation_class = Annotations(
1771            annotations=annotation_data,
1772            etag=annotation_dict["annotations"]["etag"],
1773            id=annotation_dict["annotations"]["id"],
1774        )
1775        annotation_storage_result = await annotation_class.store_async(
1776            synapse_client=self.syn
1777        )
1778        local_entity = self.synapse_entity_tracker.get(
1779            synapse_id=annotation_dict["annotations"]["id"],
1780            syn=self.syn,
1781            download_file=False,
1782            retrieve_if_not_present=False,
1783        )
1784        if local_entity:
1785            local_entity.etag = annotation_storage_result.etag
1786            local_entity.annotations = annotation_storage_result
1787        return annotation_storage_result

store annotation in an async way

Arguments:
  • annotation_dict (dict): annotation in a dictionary format
Returns:

Annotations: The stored annotations.
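The two coroutines compose naturally; a minimal round trip that fetches and re-stores the same annotations (entity ID hypothetical, bundle layout assumed as above):

```python
import asyncio

from schematic.store.synapse import SynapseStorage

store = SynapseStorage()  # assumes credentials/config are already set up

async def refresh_annotations(synapse_id: str):
    bundle = await store.get_async_annotation(synapse_id)
    # Re-store the fetched annotations unchanged; the result carries the new etag.
    return await store.store_async_annotation(bundle)

stored = asyncio.run(refresh_annotations("syn33333"))
print(stored.etag)
```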

def process_row_annotations( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, metadata_syn: Dict[str, Any], hide_blanks: bool, csv_list_regex: str, annos: Dict[str, Any], annotation_keys: str) -> Dict[str, Any]:
1789    def process_row_annotations(
1790        self,
1791        dmge: DataModelGraphExplorer,
1792        metadata_syn: Dict[str, Any],
1793        hide_blanks: bool,
1794        csv_list_regex: str,
1795        annos: Dict[str, Any],
1796        annotation_keys: str,
1797    ) -> Dict[str, Any]:
1798        """Processes metadata annotations based on the logic below:
1799        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1800            An empty or whitespace-only string.
1801            A NaN value (if the annotation is a float).
1802        If any of the above conditions are met, and hide_blanks is True, the annotation key is not uploaded and further processing of that annotation key is skipped.
1803        If any of the above conditions are met, and hide_blanks is False, assigns an empty string "" as the annotation value for that key.
1804
1805        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1806        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1807
1808        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1809
1810        4. Returns the updated annotations dictionary.
1811
1812        Args:
1813            dmge (DataModelGraphExplorer): data model graph explorer
1814            metadata_syn (dict): metadata used for Synapse storage
1815            hideBlanks (bool): if true, does not upload annotation keys with blank values.
1816            csv_list_regex (str): Regex to match with comma separated list
1817            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1818            annotation_keys (str): display_label/class_label
1819
1820        Returns:
1821            Dict[str, Any]: annotations as a dictionary
1822
1823        ```mermaid
1824        flowchart TD
1825            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1826            C -- Yes --> D{Is hide_blanks True?}
1827            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1828            D -- No --> F[Assign empty string to annotation key]
1829            C -- No --> G{Is anno_v a string?}
1830            G -- No --> H[Assign original value of anno_v to annotation key]
1831            G -- Yes --> I{Does anno_v match csv_list_regex?}
1832            I -- Yes --> J[Get validation rule of anno_k]
1833            J --> K{Does the validation rule contain 'list'}
1834            K -- Yes --> L[Split anno_v by commas and assign as list]
1835            I -- No --> H
1836            K -- No --> H
1837        ```
1838        """
1839        for anno_k, anno_v in metadata_syn.items():
1840            # Remove keys with nan or empty string values or string that only contains white space from dict of annotations to be uploaded
1841            # if present on current data annotation
1842            if hide_blanks and (
1843                (isinstance(anno_v, str) and anno_v.strip() == "")
1844                or (isinstance(anno_v, float) and np.isnan(anno_v))
1845            ):
1846                # Drop this annotation key if it is currently present
1847                if anno_k in annos["annotations"]["annotations"]:
1848                    annos["annotations"]["annotations"].pop(anno_k)
1849                continue
1850
1851            # Otherwise save annotation as appropriate
1852            if isinstance(anno_v, float) and np.isnan(anno_v):
1853                annos["annotations"]["annotations"][anno_k] = ""
1854                continue
1855
1856            # Handle strings that match the csv_list_regex and pass the validation rule
1857            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1858                # Use a dictionary to dynamically choose the argument
1859                param = (
1860                    {"node_display_name": anno_k}
1861                    if annotation_keys == "display_label"
1862                    else {"node_label": anno_k}
1863                )
1864                node_validation_rules = dmge.get_node_validation_rules(**param)
1865
1866                if rule_in_rule_list("list", node_validation_rules):
1867                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1868                    continue
1869            # default: assign the original value
1870            annos["annotations"]["annotations"][anno_k] = anno_v
1871
1872        return annos

Processes metadata annotations based on the logic below:

  1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is an empty or whitespace-only string, or a NaN value (if the annotation is a float). If either condition is met and hide_blanks is True, the annotation key is not uploaded and further processing of that key is skipped; if hide_blanks is False, an empty string "" is assigned as the annotation value for that key.

  2. If the value is a string and matches the pattern defined by csv_list_regex, get the validation rule based on "node label" or "node display name". If the rule contains "list", split the string by commas and assign the resulting list as the annotation value for that key.

  3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).

  4. Returns the updated annotations dictionary.

Arguments:
  • dmge (DataModelGraphExplorer): data model graph explorer
  • metadata_syn (dict): metadata used for Synapse storage
  • hideBlanks (bool): if true, does not upload annotation keys with blank values.
  • csv_list_regex (str): Regex to match with comma separated list
  • annos (Dict[str, Any]): dictionary of annotation returned from synapse
  • annotation_keys (str): display_label/class_label
Returns:

Dict[str, Any]: annotations as a dictionary

flowchart TD
    A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
    C -- Yes --> D{Is hide_blanks True?}
    D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
    D -- No --> F[Assign empty string to annotation key]
    C -- No --> G{Is anno_v a string?}
    G -- No --> H[Assign original value of anno_v to annotation key]
    G -- Yes --> I{Does anno_v match csv_list_regex?}
    I -- Yes --> J[Get validation rule of anno_k]
    J --> K{Does the validation rule contain 'list'}
    K -- Yes --> L[Split anno_v by commas and assign as list]
    I -- No --> H
    K -- No --> H
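A sketch of the two main behaviors, blank hiding and list splitting. The annotation names, the entity ID, and the assumption that "tissueType" carries a "list" validation rule are all hypothetical; `comma_separated_list_regex()` is the helper imported at the top of this module:

```python
from schematic.utils.validate_utils import comma_separated_list_regex

# Assumes `store` and `dmge` from the examples above.
annos = {"annotations": {"annotations": {}, "id": "syn44444", "etag": "0"}}
updated = store.process_row_annotations(
    dmge=dmge,
    metadata_syn={"tissueType": "blood,plasma", "notes": ""},
    hide_blanks=True,
    csv_list_regex=comma_separated_list_regex(),
    annos=annos,
    annotation_keys="class_label",
)
# "notes" is skipped entirely (blank value with hide_blanks=True);
# "tissueType" becomes ["blood", "plasma"] if its rules include "list",
# otherwise the original string is kept.
```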
async def format_row_annotations(*args: Any, **kwargs: Any) -> Any:
def format_manifest_annotations(*args, **kwargs):

Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv. For now just getting the Component.

@tracer.start_as_current_span('SynapseStorage::add_annotations_to_entities_files')
async def add_annotations_to_entities_files( self, dmge, manifest, manifest_record_type: str, datasetId: str, hideBlanks: bool, manifest_synapse_table_id='', annotation_keys: str = 'class_label'):
2258    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2259    async def add_annotations_to_entities_files(
2260        self,
2261        dmge,
2262        manifest,
2263        manifest_record_type: str,
2264        datasetId: str,
2265        hideBlanks: bool,
2266        manifest_synapse_table_id="",
2267        annotation_keys: str = "class_label",
2268    ):
2269        """
2270        Depending on upload type, add Ids to the entityId row. Add annotations to connected
2271        files and folders. Despite the name of this function, it also applies to folders.
2272
2273        Args:
2274            dmge: DataModelGraphExplorer Object
2275            manifest (pd.DataFrame): loaded df containing user supplied data.
2276            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2277            datasetId (str): synapse ID of folder containing the dataset
2278            hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2279            manifest_synapse_table_id (str): Default is an empty string ''.
2280            annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display
2281                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2282                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2283        Returns:
2284            manifest (pd.DataFrame): modified to add entityId as appropriate
2285
2286        """
2287
2288        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2289        if "filename" in [col.lower() for col in manifest.columns]:
2290            # get current list of files and store as dataframe
2291            dataset_files = self.getFilesInStorageDataset(datasetId)
2292            files_and_entityIds = self._get_file_entityIds(
2293                dataset_files=dataset_files, only_new_files=False
2294            )
2295            file_df = pd.DataFrame(files_and_entityIds)
2296
2297            # Merge dataframes to add entityIds
2298            manifest = manifest.merge(
2299                file_df, how="left", on="Filename", suffixes=["_x", None]
2300            ).drop("entityId_x", axis=1)
2301
2302        # Fill `entityId` for each row if missing and annotate entity as appropriate
2303        requests = set()
2304        for idx, row in manifest.iterrows():
2305            if not row["entityId"] and (
2306                manifest_record_type == "file_and_entities"
2307                or manifest_record_type == "table_file_and_entities"
2308            ):
2309                manifest, entityId = self._create_entity_id(
2310                    idx, row, manifest, datasetId
2311                )
2312            elif not row["entityId"] and manifest_record_type == "table_and_file":
2313                # If not using entityIds, fill with manifest_table_id so the entityId column is populated
2314                row["entityId"] = manifest_synapse_table_id
2315                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2316                entityId = ""
2317                # If the row is the manifest table, do not add annotations
2318            elif row["entityId"] == manifest_synapse_table_id:
2319                entityId = ""
2320            else:
2321                # get the file id of the file to annotate, collected in above step.
2322                entityId = row["entityId"]
2323
2324            # Adding annotations to connected files.
2325            if entityId:
2326                # Format annotations for Synapse
2327                annos_task = asyncio.create_task(
2328                    self.format_row_annotations(
2329                        dmge, row, entityId, hideBlanks, annotation_keys
2330                    )
2331                )
2332                requests.add(annos_task)
2333        await self._process_store_annos(requests)
2334        return manifest

Depending on upload type, add IDs to the entityId column. Add annotations to connected files and folders. Despite the name of this function, it also applies to folders.

Arguments:
  • dmge: DataModelGraphExplorer Object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • datasetId (str): synapse ID of folder containing the dataset
  • hideBlanks (bool): Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
  • manifest_synapse_table_id (str): Default is an empty string ''.
  • annotation_keys: (str) display_label/class_label (default). Determines labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:

manifest (pd.DataFrame): modified to add entityId as appropriate
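Since this method is a coroutine, callers run it with asyncio.run, as the upload_manifest_* methods below do. A minimal usage sketch; syn_store, dmge, and manifest_df are placeholder objects assumed to be set up elsewhere, and the Synapse ID is illustrative:

    import asyncio

    # Annotate files in a dataset from an in-memory manifest (placeholders throughout).
    manifest_df = asyncio.run(
        syn_store.add_annotations_to_entities_files(
            dmge=dmge,
            manifest=manifest_df,
            manifest_record_type="file_and_entities",
            datasetId="syn12345678",
            hideBlanks=True,
        )
    )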

@tracer.start_as_current_span('SynapseStorage::upload_manifest_as_table')
def upload_manifest_as_table( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, manifest: pandas.core.frame.DataFrame, metadataManifestPath: str, datasetId: str, table_name: str, component_name: str, restrict: bool, manifest_record_type: str, hideBlanks: bool, table_manipulation: str, table_column_names: str, annotation_keys: str, file_annotations_upload: bool = True):
2336    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2337    def upload_manifest_as_table(
2338        self,
2339        dmge: DataModelGraphExplorer,
2340        manifest: pd.DataFrame,
2341        metadataManifestPath: str,
2342        datasetId: str,
2343        table_name: str,
2344        component_name: str,
2345        restrict: bool,
2346        manifest_record_type: str,
2347        hideBlanks: bool,
2348        table_manipulation: str,
2349        table_column_names: str,
2350        annotation_keys: str,
2351        file_annotations_upload: bool = True,
2352    ):
2353        """Upload manifest to Synapse as a table and csv.
2354        Args:
2355            dmge: DataModelGraphExplorer object
2356            manifest (pd.DataFrame): loaded df containing user supplied data.
2357            metadataManifestPath: path to csv containing a validated metadata manifest.
2358            datasetId (str): synapse ID of folder containing the dataset
2359            table_name (str): Generated to name the table being uploaded.
2360            component_name (str): Name of the component manifest that is currently being uploaded.
2361            restrict (bool): Flag for censored data.
2362            manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2363            hideBlanks (bool): Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
2364            table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2365            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2366                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2367                display label formatting.
2368            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2369                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2370                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2371            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2372        Return:
2373            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2374        """
2375        # Upload manifest as a table, get the ID and updated manifest.
2376        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2377            dmge=dmge,
2378            manifest=manifest,
2379            datasetId=datasetId,
2380            table_name=table_name,
2381            restrict=restrict,
2382            table_manipulation=table_manipulation,
2383            table_column_names=table_column_names,
2384        )
2385
2386        if file_annotations_upload:
2387            manifest = asyncio.run(
2388                self.add_annotations_to_entities_files(
2389                    dmge,
2390                    manifest,
2391                    manifest_record_type,
2392                    datasetId,
2393                    hideBlanks,
2394                    manifest_synapse_table_id,
2395                    annotation_keys,
2396                )
2397            )
2398        # Load manifest to synapse as a CSV File
2399        manifest_synapse_file_id = self.upload_manifest_file(
2400            manifest=manifest,
2401            metadataManifestPath=metadataManifestPath,
2402            datasetId=datasetId,
2403            restrict_manifest=restrict,
2404            component_name=component_name,
2405        )
2406
2407        # Set annotations for the file manifest.
2408        manifest_annotations = self.format_manifest_annotations(
2409            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2410        )
2411        annos = self.syn.set_annotations(annotations=manifest_annotations)
2412        manifest_entity = self.synapse_entity_tracker.get(
2413            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2414        )
2415        manifest_entity.annotations = annos
2416        manifest_entity.etag = annos.etag
2417
2418        logger.info("Associated manifest file with dataset on Synapse.")
2419
2420        # Update manifest Synapse table with new entity id column.
2421        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2422            dmge=dmge,
2423            manifest=manifest,
2424            datasetId=datasetId,
2425            table_name=table_name,
2426            restrict=restrict,
2427            table_manipulation="update",
2428            table_column_names=table_column_names,
2429        )
2430
2431        # Set annotations for the table manifest
2432        manifest_annotations = self.format_manifest_annotations(
2433            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2434        )
2435        annotations_manifest_table = self.syn.set_annotations(
2436            annotations=manifest_annotations
2437        )
2438        manifest_table_entity = self.synapse_entity_tracker.get(
2439            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2440        )
2441        manifest_table_entity.annotations = annotations_manifest_table
2442        manifest_table_entity.etag = annotations_manifest_table.etag
2443
2444        return manifest_synapse_file_id

Upload manifest to Synapse as a table and csv.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • table_name (str): Generated to name the table being uploaded.
  • component_name (str): Name of the component manifest that is currently being uploaded.
  • restrict (bool): Flag for censored data.
  • manifest_record_type (str): valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • hideBlanks (bool): Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
  • table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:

manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
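A hedged call sketch for the table-plus-csv path; every object, path, and ID below is a placeholder:

    # Store a validated manifest as both a Synapse table and a CSV.
    file_id = syn_store.upload_manifest_as_table(
        dmge=dmge,
        manifest=manifest_df,
        metadataManifestPath="data/synapse_storage_manifest.csv",
        datasetId="syn12345678",
        table_name="example_synapse_storage_manifest_table",
        component_name="ExampleComponent",
        restrict=False,
        manifest_record_type="table_and_file",
        hideBlanks=False,
        table_manipulation="replace",
        table_column_names="class_label",
        annotation_keys="class_label",
    )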

@tracer.start_as_current_span('SynapseStorage::upload_manifest_as_csv')
def upload_manifest_as_csv( self, dmge, manifest, metadataManifestPath, datasetId, restrict, manifest_record_type, hideBlanks, component_name, annotation_keys: str, file_annotations_upload: bool = True):
2446    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2447    def upload_manifest_as_csv(
2448        self,
2449        dmge,
2450        manifest,
2451        metadataManifestPath,
2452        datasetId,
2453        restrict,
2454        manifest_record_type,
2455        hideBlanks,
2456        component_name,
2457        annotation_keys: str,
2458        file_annotations_upload: bool = True,
2459    ):
2460        """Upload manifest to Synapse as a csv only.
2461        Args:
2462            dmge: DataModelGraphExplorer object
2463            manifest (pd.DataFrame): loaded df containing user supplied data.
2464            metadataManifestPath: path to csv containing a validated metadata manifest.
2465            datasetId (str): synapse ID of folder containing the dataset
2466            restrict (bool): Flag for censored data.
2467            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2468            hideBlanks (bool): Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
2469            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2470                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2471                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2472            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2473        Return:
2474            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2475        """
2476        if file_annotations_upload:
2477            manifest = asyncio.run(
2478                self.add_annotations_to_entities_files(
2479                    dmge,
2480                    manifest,
2481                    manifest_record_type,
2482                    datasetId,
2483                    hideBlanks,
2484                    annotation_keys=annotation_keys,
2485                )
2486            )
2487
2488        # Load manifest to synapse as a CSV File
2489        manifest_synapse_file_id = self.upload_manifest_file(
2490            manifest,
2491            metadataManifestPath,
2492            datasetId,
2493            restrict,
2494            component_name=component_name,
2495        )
2496
2497        # Set annotations for the file manifest.
2498        manifest_annotations = self.format_manifest_annotations(
2499            manifest, manifest_synapse_file_id
2500        )
2501        annos = self.syn.set_annotations(manifest_annotations)
2502        manifest_entity = self.synapse_entity_tracker.get(
2503            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2504        )
2505        manifest_entity.annotations = annos
2506        manifest_entity.etag = annos.etag
2507
2508        logger.info("Associated manifest file with dataset on Synapse.")
2509
2510        return manifest_synapse_file_id

Upload manifest to Synapse as a csv only.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • restrict (bool): Flag for censored data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • hideBlanks (bool): Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:

manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.

@tracer.start_as_current_span('SynapseStorage::upload_manifest_combo')
def upload_manifest_combo( self, dmge, manifest, metadataManifestPath, datasetId, table_name, component_name, restrict, manifest_record_type, hideBlanks, table_manipulation, table_column_names: str, annotation_keys: str, file_annotations_upload: bool = True):
2512    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2513    def upload_manifest_combo(
2514        self,
2515        dmge,
2516        manifest,
2517        metadataManifestPath,
2518        datasetId,
2519        table_name,
2520        component_name,
2521        restrict,
2522        manifest_record_type,
2523        hideBlanks,
2524        table_manipulation,
2525        table_column_names: str,
2526        annotation_keys: str,
2527        file_annotations_upload: bool = True,
2528    ):
2529        """Upload manifest to Synapse as a table and CSV with entities.
2530        Args:
2531            dmge: DataModelGraphExplorer object
2532            manifest (pd.DataFrame): loaded df containing user supplied data.
2533            metadataManifestPath: path to csv containing a validated metadata manifest.
2534            datasetId (str): synapse ID of folder containing the dataset
2535            table_name (str): Generated to name the table being uploaded.
2536            component_name (str): Name of the component manifest that is currently being uploaded.
2537            restrict (bool): Flag for censored data.
2538            manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
2539            hideBlanks (bool): Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
2540            table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2541            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2542                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2543                display label formatting.
2544            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2545                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2546                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2547            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2548        Return:
2549            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2550        """
2551        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2552            dmge=dmge,
2553            manifest=manifest,
2554            datasetId=datasetId,
2555            table_name=table_name,
2556            restrict=restrict,
2557            table_manipulation=table_manipulation,
2558            table_column_names=table_column_names,
2559        )
2560
2561        if file_annotations_upload:
2562            manifest = asyncio.run(
2563                self.add_annotations_to_entities_files(
2564                    dmge,
2565                    manifest,
2566                    manifest_record_type,
2567                    datasetId,
2568                    hideBlanks,
2569                    manifest_synapse_table_id,
2570                    annotation_keys=annotation_keys,
2571                )
2572            )
2573
2574        # Load manifest to synapse as a CSV File
2575        manifest_synapse_file_id = self.upload_manifest_file(
2576            manifest, metadataManifestPath, datasetId, restrict, component_name
2577        )
2578
2579        # Set annotations for the file manifest.
2580        manifest_annotations = self.format_manifest_annotations(
2581            manifest, manifest_synapse_file_id
2582        )
2583        file_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2584        manifest_entity = self.synapse_entity_tracker.get(
2585            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2586        )
2587        manifest_entity.annotations = file_manifest_annotations
2588        manifest_entity.etag = file_manifest_annotations.etag
2589        logger.info("Associated manifest file with dataset on Synapse.")
2590
2591        # Update manifest Synapse table with new entity id column.
2592        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2593            dmge=dmge,
2594            manifest=manifest,
2595            datasetId=datasetId,
2596            table_name=table_name,
2597            restrict=restrict,
2598            table_manipulation="update",
2599            table_column_names=table_column_names,
2600        )
2601
2602        # Set annotations for the table manifest
2603        manifest_annotations = self.format_manifest_annotations(
2604            manifest, manifest_synapse_table_id
2605        )
2606        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2607        manifest_entity = self.synapse_entity_tracker.get(
2608            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2609        )
2610        manifest_entity.annotations = table_manifest_annotations
2611        manifest_entity.etag = table_manifest_annotations.etag
2612        return manifest_synapse_file_id

Upload manifest to Synapse as a table and CSV with entities.

Arguments:
  • dmge: DataModelGraphExplorer object
  • manifest (pd.DataFrame): loaded df containing user supplied data.
  • metadataManifestPath: path to csv containing a validated metadata manifest.
  • datasetId (str): synapse ID of folder containing the dataset
  • table_name (str): Generated to name the table being uploaded.
  • component_name (str): Name of the component manifest that is currently being uploaded.
  • restrict (bool): Flag for censored data.
  • manifest_record_type: valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest, or both.
  • hideBlanks (bool): Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
  • table_manipulation (str): Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
  • file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
Return:

manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.

@tracer.start_as_current_span('SynapseStorage::associateMetadataWithFiles')
def associateMetadataWithFiles( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer, metadataManifestPath: str, datasetId: str, manifest_record_type: str = 'table_file_and_entities', hideBlanks: bool = False, restrict_manifest=False, table_manipulation: str = 'replace', table_column_names: str = 'class_label', annotation_keys: str = 'class_label', file_annotations_upload: bool = True) -> str:
2614    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2615    def associateMetadataWithFiles(
2616        self,
2617        dmge: DataModelGraphExplorer,
2618        metadataManifestPath: str,
2619        datasetId: str,
2620        manifest_record_type: str = "table_file_and_entities",
2621        hideBlanks: bool = False,
2622        restrict_manifest=False,
2623        table_manipulation: str = "replace",
2624        table_column_names: str = "class_label",
2625        annotation_keys: str = "class_label",
2626        file_annotations_upload: bool = True,
2627    ) -> str:
2628        """Associate metadata with files in a storage dataset already on Synapse.
2629        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2630
2631        If this is a new manifest, there may be no Synapse entities associated with the rows of this manifest.
2632        This may be due to the data type (e.g. clinical data) being tabular
2633        and not requiring files; to utilize uniform interfaces downstream
2634        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2635        and an entityId column is added to the manifest containing the resulting
2636        entity IDs; a table is also created at present as an additional interface
2637        for downstream query and interaction with the data.
2638
2639        Args:
2640            dmge: DataModelGraphExplorer Object
2641            metadataManifestPath: path to csv containing a validated metadata manifest.
2642            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2643            Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item.
2644            In this case, the system creates an entity (a folder) on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this entity.
2645            datasetId: synapse ID of folder containing the dataset
2646            manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options 'file_and_entities' and 'table_and_file' in combination.
2647            hideBlanks: Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
2648            restrict_manifest (bool): Default is false. Flag for censored data.
2649            table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2650            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2651                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2652                display label formatting.
2653            annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display
2654                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2655                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2656        Returns:
2657            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2658        """
2659        # Read new manifest CSV:
2660        manifest = self._read_manifest(metadataManifestPath)
2661        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2662
2663        table_name, component_name = self._generate_table_name(manifest)
2664
2665        # Upload manifest to synapse based on user input (manifest_record_type)
2666        if manifest_record_type == "file_only":
2667            manifest_synapse_file_id = self.upload_manifest_as_csv(
2668                dmge=dmge,
2669                manifest=manifest,
2670                metadataManifestPath=metadataManifestPath,
2671                datasetId=datasetId,
2672                restrict=restrict_manifest,
2673                hideBlanks=hideBlanks,
2674                manifest_record_type=manifest_record_type,
2675                component_name=component_name,
2676                annotation_keys=annotation_keys,
2677                file_annotations_upload=file_annotations_upload,
2678            )
2679        elif manifest_record_type == "table_and_file":
2680            manifest_synapse_file_id = self.upload_manifest_as_table(
2681                dmge=dmge,
2682                manifest=manifest,
2683                metadataManifestPath=metadataManifestPath,
2684                datasetId=datasetId,
2685                table_name=table_name,
2686                component_name=component_name,
2687                restrict=restrict_manifest,
2688                hideBlanks=hideBlanks,
2689                manifest_record_type=manifest_record_type,
2690                table_manipulation=table_manipulation,
2691                table_column_names=table_column_names,
2692                annotation_keys=annotation_keys,
2693                file_annotations_upload=file_annotations_upload,
2694            )
2695        elif manifest_record_type == "file_and_entities":
2696            manifest_synapse_file_id = self.upload_manifest_as_csv(
2697                dmge=dmge,
2698                manifest=manifest,
2699                metadataManifestPath=metadataManifestPath,
2700                datasetId=datasetId,
2701                restrict=restrict_manifest,
2702                hideBlanks=hideBlanks,
2703                manifest_record_type=manifest_record_type,
2704                component_name=component_name,
2705                annotation_keys=annotation_keys,
2706                file_annotations_upload=file_annotations_upload,
2707            )
2708        elif manifest_record_type == "table_file_and_entities":
2709            manifest_synapse_file_id = self.upload_manifest_combo(
2710                dmge=dmge,
2711                manifest=manifest,
2712                metadataManifestPath=metadataManifestPath,
2713                datasetId=datasetId,
2714                table_name=table_name,
2715                component_name=component_name,
2716                restrict=restrict_manifest,
2717                hideBlanks=hideBlanks,
2718                manifest_record_type=manifest_record_type,
2719                table_manipulation=table_manipulation,
2720                table_column_names=table_column_names,
2721                annotation_keys=annotation_keys,
2722                file_annotations_upload=file_annotations_upload,
2723            )
2724        else:
2725            raise ValueError("Please enter a valid manifest_record_type.")
2726        return manifest_synapse_file_id

Associate metadata with files in a storage dataset already on Synapse. Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.

If this is a new manifest, there may be no Synapse entities associated with the rows of this manifest. This may be due to the data type (e.g. clinical data) being tabular and not requiring files; to utilize uniform interfaces downstream (i.e. fileviews), a Synapse entity (a folder) is created for each row and an entityId column is added to the manifest containing the resulting entity IDs; a table is also created at present as an additional interface for downstream query and interaction with the data.

Arguments:
  • dmge: DataModelGraphExplorer Object
  • metadataManifestPath: path to csv containing a validated metadata manifest. The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type. Some datasets, e.g. clinical data, do not contain file ids, but data is stored in a table: one row per item. In this case, the system creates an entity (a folder) on Synapse for each row in the table (e.g. patient, biospecimen) and associates the column data as metadata/annotations to this entity.
  • datasetId: synapse ID of folder containing the dataset
  • manifest_record_type: Default value is 'table_file_and_entities'. Valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest. 'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options 'file_and_entities' and 'table_and_file' in combination.
  • hideBlanks: Default is False. Boolean flag; annotation keys with blank values are not uploaded when True, and are uploaded with empty string values when False.
  • restrict_manifest (bool): Default is false. Flag for censored data.
  • table_manipulation (str): Default is 'replace'. Specify the way the manifest tables should be stored on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
  • table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain display label formatting.
  • annotation_keys: (str) display_label/class_label (default). Sets labeling style for annotation keys. class_label will format the display name as upper camelcase, and strip blacklisted characters; display_label will strip blacklisted characters including spaces, to retain display label formatting while ensuring the label is formatted properly for Synapse annotations.
Returns:

manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
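associateMetadataWithFiles is the top-level entry point; manifest_record_type routes the call to the upload_manifest_as_csv, upload_manifest_as_table, or upload_manifest_combo helpers documented above. A minimal sketch with placeholder objects and IDs:

    # 'table_file_and_entities' (the default) routes to upload_manifest_combo.
    manifest_file_id = syn_store.associateMetadataWithFiles(
        dmge=dmge,
        metadataManifestPath="data/synapse_storage_manifest.csv",
        datasetId="syn12345678",
        manifest_record_type="table_file_and_entities",
        hideBlanks=False,
        restrict_manifest=False,
        table_manipulation="replace",
    )
    print(f"Manifest CSV stored as {manifest_file_id}")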

def getTableAnnotations(self, table_id: str):
2728    def getTableAnnotations(self, table_id: str):
2729        """Generate dictionary of annotations for the given Synapse table.
2730        Synapse returns all custom annotations as lists since they
2731        can contain multiple values. In all cases, the values will
2732        be converted into strings and concatenated with ", ".
2733
2734        Args:
2735            table_id (str): Synapse ID for the table.
2736
2737        Returns:
2738            dict: Annotations as comma-separated strings.
2739        """
2740        try:
2741            entity = self.synapse_entity_tracker.get(
2742                synapse_id=table_id, syn=self.syn, download_file=False
2743            )
2744            is_table = entity.concreteType.endswith(".TableEntity")
2745            annotations_raw = entity.annotations
2746        except SynapseHTTPError:
2747            # If an error occurs with retrieving entity, skip it
2748            # This could be caused by a temporary file view that
2749            # was deleted since its ID was retrieved
2750            is_table = False
2751
2752        # Skip anything that isn't a table
2753        if not (is_table):
2754            return None
2755
2756        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2757
2758        return annotations

Generate dictionary of annotations for the given Synapse table. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".

Arguments:
  • table_id (str): Synapse ID for the table.
Returns:

dict: Annotations as comma-separated strings.

def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2760    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2761        """Generate dictionary of annotations for the given Synapse file.
2762        Synapse returns all custom annotations as lists since they
2763        can contain multiple values. In all cases, the values will
2764        be converted into strings and concatenated with ", ".
2765
2766        Args:
2767            fileId (str): Synapse ID for dataset file.
2768
2769        Returns:
2770            dict: Annotations as comma-separated strings.
2771        """
2772
2773        # Get entity metadata, including annotations
2774        try:
2775            entity = self.synapse_entity_tracker.get(
2776                synapse_id=fileId, syn=self.syn, download_file=False
2777            )
2778            is_file = entity.concreteType.endswith(".FileEntity")
2779            is_folder = entity.concreteType.endswith(".Folder")
2780            annotations_raw = entity.annotations
2781        except SynapseHTTPError:
2782            # If an error occurs with retrieving entity, skip it
2783            # This could be caused by a temporary file view that
2784            # was deleted since its ID was retrieved
2785            is_file, is_folder = False, False
2786
2787        # Skip anything that isn't a file or folder
2788        if not (is_file or is_folder):
2789            return None
2790
2791        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2792
2793        return annotations

Generate dictionary of annotations for the given Synapse file. Synapse returns all custom annotations as lists since they can contain multiple values. In all cases, the values will be converted into strings and concatenated with ", ".

Arguments:
  • fileId (str): Synapse ID for dataset file.
Returns:

dict: Annotations as comma-separated strings.

def getEntityAnnotations(self, fileId, entity, annotations_raw):
2795    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2796        # Extract annotations from their lists and stringify. For example:
2797        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2798        annotations = dict()
2799        for key, vals in annotations_raw.items():
2800            if isinstance(vals, list) and len(vals) == 1:
2801                annotations[key] = str(vals[0])
2802            else:
2803                annotations[key] = ", ".join(str(v) for v in vals)
2804
2805        # Add the file entity ID and eTag, which weren't lists
2806        assert fileId == entity.id, (
2807            "For some reason, the Synapse ID in the response doesn't match "
2808            "the Synapse ID sent in the request (via synapseclient)."
2809        )
2810        annotations["entityId"] = fileId
2811        annotations["eTag"] = entity.etag
2812
2813        return annotations
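The stringify behavior is easy to see in isolation. The loop below mirrors the one in getEntityAnnotations and runs standalone:

    annotations_raw = {"YearofBirth": [1980], "author": ["bruno", "milen", "sujay"]}

    annotations = {}
    for key, vals in annotations_raw.items():
        if isinstance(vals, list) and len(vals) == 1:
            annotations[key] = str(vals[0])  # single-element list -> plain string
        else:
            annotations[key] = ", ".join(str(v) for v in vals)  # multi-value -> joined

    print(annotations)
    # {'YearofBirth': '1980', 'author': 'bruno, milen, sujay'}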
def getDatasetAnnotations( self, datasetId: str, fill_na: bool = True, force_batch: bool = False) -> pandas.core.frame.DataFrame:
2815    def getDatasetAnnotations(
2816        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2817    ) -> pd.DataFrame:
2818        """Generate table for annotations across all files in given dataset.
2819
2820        Args:
2821            datasetId (str): Synapse ID for dataset folder.
2822            fill_na (bool): Whether to replace missing values with
2823                blank strings.
2824            force_batch (bool): Whether to force the function to use
2825                batch mode, which uses a file view to retrieve
2826                annotations for a given dataset. Defaults to False;
2827                batch mode is used automatically when the dataset contains 50 or more files.
2828
2829        Returns:
2830            pd.DataFrame: Table of annotations.
2831        """
2832        # Get all files in given dataset
2833        dataset_files = self.getFilesInStorageDataset(datasetId)
2834
2835        # if there are no dataset files, there are no annotations
2836        # return an empty DataFrame
2837        if not dataset_files:
2838            return pd.DataFrame()
2839
2840        dataset_files_map = dict(dataset_files)
2841        dataset_file_ids, _ = list(zip(*dataset_files))
2842
2843        # Get annotations for each file from Step 1
2844        # Batch mode
2845        try_batch = len(dataset_files) >= 50 or force_batch
2846        if try_batch:
2847            try:
2848                logger.info("Trying batch mode for retrieving Synapse annotations")
2849                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2850            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2851                logger.info(
2852                    f"Unable to create a temporary file view bound to {datasetId}. "
2853                    "Defaulting to slower iterative retrieval of annotations."
2854                )
2855                # Default to the slower non-batch method
2856                logger.info("Batch mode failed (probably due to permission error)")
2857                try_batch = False
2858
2859        # Non-batch mode
2860        if not try_batch:
2861            logger.info("Using slower (non-batch) sequential mode")
2862            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2863            # Remove any annotations for non-file/folders (stored as None)
2864            records = filter(None, records)
2865            table = pd.DataFrame.from_records(records)
2866
2867        # Add filenames for the files that "survived" annotation retrieval
2868        filenames = [dataset_files_map[i] for i in table["entityId"]]
2869
2870        if "Filename" not in table.columns:
2871            table.insert(0, "Filename", filenames)
2872
2873        # Ensure that entityId and eTag are at the end
2874        entity_ids = table.pop("entityId")
2875        etags = table.pop("eTag")
2876        table.insert(len(table.columns), "entityId", entity_ids)
2877        table.insert(len(table.columns), "eTag", etags)
2878
2879        # Missing values are filled in with empty strings for Google Sheets
2880        if fill_na:
2881            table.fillna("", inplace=True)
2882
2883        # Force all values as strings
2884        return table.astype(str)

Generate table for annotations across all files in given dataset.

Arguments:
  • datasetId (str): Synapse ID for dataset folder.
  • fill_na (bool): Whether to replace missing values with blank strings.
  • force_batch (bool): Whether to force the function to use batch mode, which uses a file view to retrieve annotations for a given dataset. Defaults to False; batch mode is used automatically when the dataset contains 50 or more files.
Returns:

pd.DataFrame: Table of annotations.
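A short usage sketch; syn_store and the dataset ID are placeholders. The returned frame always ends with the entityId and eTag columns:

    annos_df = syn_store.getDatasetAnnotations("syn12345678", fill_na=True)
    print(annos_df[["Filename", "entityId", "eTag"]].head())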

def raise_final_error(retry_state):
2886    def raise_final_error(retry_state):
2887        return retry_state.outcome.result()
def checkIfinAssetView(self, syn_id) -> bool:
2889    def checkIfinAssetView(self, syn_id) -> bool:
2890        # get data in administrative fileview for this pipeline
2891        assetViewTable = self.getStorageFileviewTable()
2892        all_files = list(assetViewTable["id"])
2893        if syn_id in all_files:
2894            return True
2895        else:
2896            return False
@tracer.start_as_current_span('SynapseStorage::getDatasetProject')
@retry(stop=stop_after_attempt(5), wait=wait_chain(*[wait_fixed(10) for i in range(2)] + [wait_fixed(15) for i in range(2)] + [wait_fixed(20)]), retry=retry_if_exception_type(LookupError), retry_error_callback=raise_final_error)
def getDatasetProject(self, datasetId: str) -> str:
2898    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2899    @retry(
2900        stop=stop_after_attempt(5),
2901        wait=wait_chain(
2902            *[wait_fixed(10) for i in range(2)]
2903            + [wait_fixed(15) for i in range(2)]
2904            + [wait_fixed(20)]
2905        ),
2906        retry=retry_if_exception_type(LookupError),
2907        retry_error_callback=raise_final_error,
2908    )
2909    def getDatasetProject(self, datasetId: str) -> str:
2910        """Get parent project for a given dataset ID.
2911
2912        Args:
2913            datasetId (str): Synapse entity ID (folder or project).
2914
2915        Raises:
2916            PermissionError: Raised if the Synapse ID cannot be retrieved by the user.
2917            LookupError: Raised if the Synapse ID doesn't appear in the file view.
2918
2919        Returns:
2920            str: The Synapse ID for the parent project.
2921        """
2922
2923        # Subset main file view
2924        dataset_index = self.storageFileviewTable["id"] == datasetId
2925        dataset_row = self.storageFileviewTable[dataset_index]
2926
2927        # re-query if no datasets found
2928        if dataset_row.empty:
2929            sleep(5)
2930            self.query_fileview(force_requery=True)
2931            # Subset main file view
2932            dataset_index = self.storageFileviewTable["id"] == datasetId
2933            dataset_row = self.storageFileviewTable[dataset_index]
2934
2935        # Return `projectId` for given row if only one found
2936        if len(dataset_row) == 1:
2937            dataset_project = dataset_row["projectId"].values[0]
2938            return dataset_project
2939
2940        # Otherwise, check if already project itself
2941        try:
2942            syn_object = self.synapse_entity_tracker.get(
2943                synapse_id=datasetId, syn=self.syn, download_file=False
2944            )
2945            if syn_object.properties["concreteType"].endswith("Project"):
2946                return datasetId
2947        except SynapseHTTPError:
2948            raise PermissionError(
2949                f"The given dataset ({datasetId}) isn't accessible with this "
2950                "user. This might be caused by a typo in the dataset Synapse ID."
2951            )
2952
2953        # If not, then assume dataset not in file view
2954        raise LookupError(
2955            f"The given dataset ({datasetId}) doesn't appear in the "
2956            f"configured file view ({self.storageFileview}). This might "
2957            "mean that the file view's scope needs to be updated."
2958        )

Get parent project for a given dataset ID.

Arguments:
  • datasetId (str): Synapse entity ID (folder or project).
Raises:
  • PermissionError: Raised if the Synapse ID cannot be retrieved by the user.
  • LookupError: Raised if the Synapse ID doesn't appear in the file view.
Returns:

str: The Synapse ID for the parent project.
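The tenacity decorator above allows up to 5 attempts on LookupError: with stop_after_attempt(5), only the first four entries of the wait chain are consumed (10 s, 10 s, 15 s, 15 s); the final 20 s wait would apply only if the attempt limit were higher. A standalone sketch of the same schedule, with flaky_lookup as a stand-in function:

    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_chain, wait_fixed

    @retry(
        stop=stop_after_attempt(5),                      # at most 5 attempts
        wait=wait_chain(wait_fixed(10), wait_fixed(10),  # waits before attempts 2-5:
                        wait_fixed(15), wait_fixed(15),  # 10s, 10s, 15s, 15s
                        wait_fixed(20)),                 # 20s reached only beyond 5 attempts
        retry=retry_if_exception_type(LookupError),
    )
    def flaky_lookup():
        ...  # stand-in for getDatasetProject's body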

def getDatasetAnnotationsBatch( self, datasetId: str, dataset_file_ids: Sequence[str] = None) -> pandas.core.frame.DataFrame:
2960    def getDatasetAnnotationsBatch(
2961        self, datasetId: str, dataset_file_ids: Sequence[str] = None
2962    ) -> pd.DataFrame:
2963        """Generate table for annotations across all files in given dataset.
2964        This function uses a temporary file view to generate a table
2965        instead of iteratively querying for individual entity annotations.
2966        This function is expected to run much faster than
2967        `self.getDatasetAnnotations` on large datasets.
2968
2969        Args:
2970            datasetId (str): Synapse ID for dataset folder.
2971            dataset_file_ids (Sequence[str]): List of Synapse IDs
2972                for dataset files/folders used to subset the table.
2973
2974        Returns:
2975            pd.DataFrame: Table of annotations.
2976        """
2977        # Create data frame from annotations file view
2978        with DatasetFileView(datasetId, self.syn) as fileview:
2979            table = fileview.query()
2980
2981        if dataset_file_ids:
2982            table = table.loc[table.index.intersection(dataset_file_ids)]
2983
2984        table = table.reset_index(drop=True)
2985
2986        return table

Generate table for annotations across all files in given dataset. This function uses a temporary file view to generate a table instead of iteratively querying for individual entity annotations. This function is expected to run much faster than self.getDatasetAnnotations on large datasets.

Arguments:
  • datasetId (str): Synapse ID for dataset folder.
  • dataset_file_ids (Sequence[str]): List of Synapse IDs for dataset files/folders used to subset the table.
Returns:

pd.DataFrame: Table of annotations.
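The batch path relies on a temporary file view wrapped in a context manager. A sketch of the same pattern, assuming DatasetFileView is the helper class this module provides (its definition appears later in the source) and using placeholder Synapse IDs:

    # Build a temporary file view over the dataset, query it, then tear it down.
    with DatasetFileView("syn12345678", syn_store.syn) as fileview:
        table = fileview.query()

    # Optionally subset to specific files/folders by Synapse ID.
    table = table.loc[table.index.intersection(["syn111", "syn222"])].reset_index(drop=True)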

class TableOperations:
2999class TableOperations:
3000    """
3001    Object to hold functions for various table operations specific to the Synapse Asset Store.
3002
3003    Currently implemented operations are:
3004    createTable: upload a manifest as a new table when none exists
3005    replaceTable: replace the metadata in an existing table with metadata from a new manifest
3006    updateTable: add a column to a table that already exists on synapse
3007
3008    Operations currently in development are:
3009    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
3010    """
3011
3012    def __init__(
3013        self,
3014        synStore: SynapseStorage,
3015        tableToLoad: pd.DataFrame = None,
3016        tableName: str = None,
3017        datasetId: str = None,
3018        existingTableId: str = None,
3019        restrict: bool = False,
3020        synapse_entity_tracker: SynapseEntityTracker = None,
3021    ):
3022        """
3023        Class governing table operations (creation, replacement, upserts, updates) in schematic
3024
3025        tableToLoad: manifest formatted appropriately for the table
3026        tableName: name of the table to be uploaded
3027        datasetId: synID of the dataset for the manifest
3028        existingTableId: synId of the table currently existing on synapse (if there is one)
3029        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3030        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3031
3032        """
3033        self.synStore = synStore
3034        self.tableToLoad = tableToLoad
3035        self.tableName = tableName
3036        self.datasetId = datasetId
3037        self.existingTableId = existingTableId
3038        self.restrict = restrict
3039        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3040
3041    @tracer.start_as_current_span("TableOperations::createTable")
3042    def createTable(
3043        self,
3044        columnTypeDict: dict = None,
3045        specifySchema: bool = True,
3046    ):
3047        """
3048        Method to create a table from a metadata manifest and upload it to synapse
3049
3050        Args:
3051            columnTypeDict: dictionary schema for table columns: type, size, etc
3052            specifySchema: to specify a specific schema for the table format
3053
3054        Returns:
3055            table.schema.id: synID of the newly created table
3056        """
3057        datasetEntity = self.synapse_entity_tracker.get(
3058            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3059        )
3060        datasetName = datasetEntity.name
3061        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3062
3063        if not self.tableName:
3064            self.tableName = datasetName + "table"
3065        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3066        if specifySchema:
3067            if columnTypeDict == {}:
3068                logger.error("Did not provide a columnTypeDict.")
3069            # create list of columns:
3070            cols = []
3071            for col in self.tableToLoad.columns:
3072                if col in table_schema_by_cname:
3073                    col_type = table_schema_by_cname[col]["columnType"]
3074                    max_size = (
3075                        table_schema_by_cname[col]["maximumSize"]
3076                        if "maximumSize" in table_schema_by_cname[col].keys()
3077                        else 100
3078                    )
3079                    max_list_len = 250
3080                    if max_size and max_list_len:
3081                        cols.append(
3082                            Column(
3083                                name=col,
3084                                columnType=col_type,
3085                                maximumSize=max_size,
3086                                maximumListLength=max_list_len,
3087                            )
3088                        )
3089                    elif max_size:
3090                        cols.append(
3091                            Column(name=col, columnType=col_type, maximumSize=max_size)
3092                        )
3093                    else:
3094                        cols.append(Column(name=col, columnType=col_type))
3095                else:
3096                    # TODO add warning that the given col was not found and its max size is set to 100
3097                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3098            schema = Schema(
3099                name=self.tableName, columns=cols, parent=datasetParentProject
3100            )
3101            table = Table(schema, self.tableToLoad)
3102            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3103            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3104            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3105            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3106            return table.schema.id
3107        else:
3108            # For just uploading the tables to synapse using default
3109            # column types.
3110            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3111            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3112            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3113            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3114            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3115            return table.schema.id
3116
3117    @tracer.start_as_current_span("TableOperations::replaceTable")
3118    def replaceTable(
3119        self,
3120        specifySchema: bool = True,
3121        columnTypeDict: dict = None,
3122    ):
3123        """
3124        Method to replace an existing table on synapse with metadata from a new manifest
3125
3126        Args:
3127            specifySchema: whether to specify a schema for the table format
3128            columnTypeDict: dictionary schema for table columns: type, size, etc
3129
3130        Returns:
3131           existingTableId: synID of the already existing table that had its metadata replaced
3132        """
3133        datasetEntity = self.synapse_entity_tracker.get(
3134            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3135        )
3136
3137        datasetName = datasetEntity.name
3138        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3139        existing_table, existing_results = self.synStore.get_synapse_table(
3140            self.existingTableId
3141        )
3142        # remove rows
3143        self.synStore.syn.delete(existing_results)
3144        # Data changes such as removing all rows causes the eTag to change.
3145        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3146        # wait for row deletion to finish on synapse before getting empty table
3147        sleep(10)
3148
3149        # removes all current columns
3150        current_table = self.synapse_entity_tracker.get(
3151            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3152        )
3153
3154        current_columns = self.synStore.syn.getTableColumns(current_table)
3155
3156        for col in current_columns:
3157            current_table.removeColumn(col)
3158
3159        if not self.tableName:
3160            self.tableName = datasetName + "table"
3161
3162        # Process columns according to manifest entries
3163        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3164        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3165        if specifySchema:
3166            if columnTypeDict == {}:
3167                logger.error("Did not provide a columnTypeDict.")
3168            # create list of columns:
3169            cols = []
3170
3171            for col in self.tableToLoad.columns:
3172                if col in table_schema_by_cname:
3173                    col_type = table_schema_by_cname[col]["columnType"]
3174                    max_size = (
3175                        table_schema_by_cname[col]["maximumSize"]
3176                        if "maximumSize" in table_schema_by_cname[col].keys()
3177                        else 100
3178                    )
3179                    max_list_len = 250
3180                    if max_size and max_list_len:
3181                        cols.append(
3182                            Column(
3183                                name=col,
3184                                columnType=col_type,
3185                                maximumSize=max_size,
3186                                maximumListLength=max_list_len,
3187                            )
3188                        )
3189                    elif max_size:
3190                        cols.append(
3191                            Column(name=col, columnType=col_type, maximumSize=max_size)
3192                        )
3193                    else:
3194                        cols.append(Column(name=col, columnType=col_type))
3195                else:
3196                    # TODO add warning that the given col was not found and its max size is set to 100
3197                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3198
3199            # adds new columns to schema
3200            for col in cols:
3201                current_table.addColumn(col)
3202
3203            table_result = self.synStore.syn.store(
3204                current_table, isRestricted=self.restrict
3205            )
3206            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3207            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3208            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3209
3210            # wait for synapse store to finish
3211            sleep(1)
3212
3213            # build schema and table from columns and store with necessary restrictions
3214            schema = Schema(
3215                name=self.tableName, columns=cols, parent=datasetParentProject
3216            )
3217            schema.id = self.existingTableId
3218            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3219            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3220            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3221            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3222            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3223        else:
3224            logger.error("Must specify a schema for table replacements")
3225
3226        # remove system metadata from manifest
3227        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3228        return self.existingTableId
3229
3230    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3231    def _get_auth_token(
3232        self,
3233    ):
3234        authtoken = None
3235
3236        # Get access token from environment variable if available
3237        # Primarily useful for testing environments, with other possible usefulness for containers
3238        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3239        if env_access_token:
3240            authtoken = env_access_token
3241            return authtoken
3242
3243        # Get token from authorization header
3244        # Primarily useful for API endpoint functionality
3245        if "Authorization" in self.synStore.syn.default_headers:
3246            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3247                "Bearer "
3248            )[-1]
3249            return authtoken
3250
3251        # retrieve credentials from the synapse object
3252        # Primarily useful for local users; credentials are only stored here when a .synapseConfig file is used, but checking to be safe
3253        synapse_object_creds = self.synStore.syn.credentials
3254        if hasattr(synapse_object_creds, "_token"):
3255            authtoken = synapse_object_creds.secret
3256
3257        # Try getting creds from .synapseConfig file if it exists
3258        # Primarily useful for local users. Seems to correlate with credentials stored in the synapse object when logged in
3259        if os.path.exists(CONFIG.synapse_configuration_path):
3260            config = get_config_file(CONFIG.synapse_configuration_path)
3261
3262            # check which credentials are provided in file
3263            if config.has_option("authentication", "authtoken"):
3264                authtoken = config.get("authentication", "authtoken")
3265
3266        # raise error if required credentials are not found
3267        if not authtoken:
3268            raise NameError(
3269                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3270            )
3271
3272        return authtoken
3273
3274    @tracer.start_as_current_span("TableOperations::upsertTable")
3275    def upsertTable(self, dmge: DataModelGraphExplorer):
3276        """
3277        Method to upsert rows from a new manifest into an existing table on synapse
3278        For upsert functionality to work, primary keys must follow the naming convention of <component>_id
3279        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3280        Currently it is required to use -tcn "display label" with table upserts.
3281
3282
3283        Args:
3284            dmge: DataModelGraphExplorer instance
3285
3286        Returns:
3287           existingTableId: synID of the existing table that the manifest rows were upserted into
3288        """
3289
3290        authtoken = self._get_auth_token()
3291
3292        synapseDB = SynapseDatabase(
3293            auth_token=authtoken,
3294            project_id=self.synStore.getDatasetProject(self.datasetId),
3295            syn=self.synStore.syn,
3296            synapse_entity_tracker=self.synapse_entity_tracker,
3297        )
3298
3299        try:
3300            # Try performing upsert
3301            synapseDB.upsert_table_rows(
3302                table_name=self.tableName, data=self.tableToLoad
3303            )
3304        except SynapseHTTPError as ex:
3305            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3306            if "Id is not a valid column name or id" in str(ex):
3307                self._update_table_uuid_column(dmge)
3308                synapseDB.upsert_table_rows(
3309                    table_name=self.tableName, data=self.tableToLoad
3310                )
3311            # Raise if other error
3312            else:
3313                raise ex
3314
3315        return self.existingTableId
3316
3317    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3318    def _update_table_uuid_column(
3319        self,
3320        dmge: DataModelGraphExplorer,
3321    ) -> None:
3322        """Removes the `Uuid` column when present, and replaces it with an `Id` column
3323        Used to enable backwards compatibility for manifests using the old `Uuid` convention
3324
3325        Args:
3326            dmge: DataModelGraphExplorer instance
3327
3328        Returns:
3329            None
3330        """
3331
3332        # Get the columns of the schema
3333        schema = self.synapse_entity_tracker.get(
3334            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3335        )
3336
3337        cols = self.synStore.syn.getTableColumns(schema)
3338
3339        # Iterate through columns until `Uuid` column is found
3340        for col in cols:
3341            if col.name.lower() == "uuid":
3342                # See if schema has `Uuid` column specified
3343                try:
3344                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3345                except KeyError:
3346                    uuid_col_in_schema = False
3347
3348                # If there is, then create a new `Id` column from scratch
3349                if uuid_col_in_schema:
3350                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3351                    schema.addColumn(new_col)
3352                    schema = self.synStore.syn.store(schema)
3353                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3354                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3355                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3356                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3357                else:
3358                    # Build ColumnModel that will be used for new column
3359                    id_column = Column(
3360                        name="Id",
3361                        columnType="STRING",
3362                        maximumSize=64,
3363                        defaultValue=None,
3364                        maximumListLength=1,
3365                    )
3366                    new_col_response = self.synStore.syn.store(id_column)
3367
3368                    # Define columnChange body
3369                    columnChangeDict = {
3370                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3371                        "entityId": self.existingTableId,
3372                        "changes": [
3373                            {
3374                                "oldColumnId": col["id"],
3375                                "newColumnId": new_col_response["id"],
3376                            }
3377                        ],
3378                    }
3379
3380                    self.synStore.syn._async_table_update(
3381                        table=self.existingTableId,
3382                        changes=[columnChangeDict],
3383                        wait=False,
3384                    )
3385                break
3386
3387        return
3388
3389    @tracer.start_as_current_span("TableOperations::updateTable")
3390    def updateTable(
3391        self,
3392        update_col: str = "Id",
3393    ):
3394        """
3395        Method to update an existing table with a new column
3396
3397        Args:
3398            update_col: column to index the old and new tables on
3399
3400        Returns:
3401           existingTableId: synID of the existing table that was updated
3402        """
3403        existing_table, existing_results = self.synStore.get_synapse_table(
3404            self.existingTableId
3405        )
3406
3407        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3408        # store table with existing etag data and impose restrictions as appropriate
3409        table_result = self.synStore.syn.store(
3410            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3411            isRestricted=self.restrict,
3412        )
3413        # We cannot store the Table in the `synapse_entity_tracker` because there is
3414        # no `Schema` on the table object. The above `.store()` function call would
3415        # also update the ETag of the entity within Synapse. Remove it from the tracker
3416        # and re-retrieve it later on if needed again.
3417        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3418
3419        return self.existingTableId

Object to hold functions for various table operations specific to the Synapse Asset Store.

Currently implemented operations are:
  • createTable: upload a manifest as a new table when none exists
  • replaceTable: replace the metadata in an existing table with metadata from a new manifest
  • updateTable: add a column to a table that already exists on synapse

Operations currently in development are:
  • upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest

TableOperations( synStore: SynapseStorage, tableToLoad: pandas.core.frame.DataFrame = None, tableName: str = None, datasetId: str = None, existingTableId: str = None, restrict: bool = False, synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = None)

Class governing table operations (creation, replacement, upserts, updates) in schematic

Arguments:
  • tableToLoad: manifest formatted appropriately for the table
  • tableName: name of the table to be uploaded
  • datasetId: synID of the dataset for the manifest
  • existingTableId: synID of the table currently existing on synapse (if there is one)
  • restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
  • synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

synStore
tableToLoad
tableName
datasetId
existingTableId
restrict
synapse_entity_tracker
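
A minimal construction sketch, assuming an already-authenticated SynapseStorage instance; the dataset synID and manifest contents below are placeholders for illustration, not values from this module.

    import pandas as pd

    from schematic.store.synapse import SynapseStorage, TableOperations

    # Assumption: SynapseStorage() authenticates from the environment or a
    # .synapseConfig file; see SynapseStorage for the supported credential sources.
    syn_store = SynapseStorage()

    # Manifest formatted appropriately for the table.
    manifest = pd.DataFrame({"Id": ["a1b2", "c3d4"], "Sample ID": ["s1", "s2"]})

    table_ops = TableOperations(
        synStore=syn_store,
        tableToLoad=manifest,
        tableName="example_manifest_table",
        datasetId="syn00000000",  # placeholder synID of the dataset
        restrict=False,           # manifest contains no sensitive data
    )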
@tracer.start_as_current_span('TableOperations::createTable')
def createTable(self, columnTypeDict: dict = None, specifySchema: bool = True):

Method to create a table from a metadata manifest and upload it to synapse

Arguments:
  • columnTypeDict: dictionary schema for table columns: type, size, etc
  • specifySchema: to specify a specific schema for the table format
Returns:

table.schema.id: synID of the newly created table
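
A hedged usage sketch continuing the `table_ops` example above. The schema-less path is the simplest; for the explicit-schema path, the exact shape expected for `columnTypeDict` is resolved internally by `SynapseStorage._get_table_schema_by_cname`, so the mapping shown is an assumed illustration rather than a documented contract.

    # Simplest path: upload with default column types inferred via build_table.
    new_table_id = table_ops.createTable(specifySchema=False)

    # Explicit-schema path; the dict shape below (column name -> column model)
    # is an assumption for illustration.
    new_table_id = table_ops.createTable(
        columnTypeDict={"Sample ID": {"columnType": "STRING", "maximumSize": 100}},
        specifySchema=True,
    )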

@tracer.start_as_current_span('TableOperations::replaceTable')
def replaceTable(self, specifySchema: bool = True, columnTypeDict: dict = None):

Method to replace an existing table on synapse with metadata from a new manifest

Arguments:
  • specifySchema: whether to specify a schema for the table format
  • columnTypeDict: dictionary schema for table columns: type, size, etc
Returns:

existingTableId: synID of the already existing table that had its metadata replaced
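
A sketch of a replacement call, assuming the manifest has been regenerated as `new_manifest` and the target table already exists; all synIDs are placeholders.

    # Assumption: new_manifest is the regenerated manifest DataFrame.
    replace_ops = TableOperations(
        synStore=syn_store,
        tableToLoad=new_manifest,
        tableName="example_manifest_table",
        datasetId="syn00000000",        # placeholder
        existingTableId="syn11111111",  # placeholder: table whose rows are replaced
    )

    # specifySchema must remain True; the schema-less branch only logs an error.
    table_id = replace_ops.replaceTable(
        specifySchema=True,
        columnTypeDict={"Sample ID": {"columnType": "STRING", "maximumSize": 100}},
    )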

@tracer.start_as_current_span('TableOperations::upsertTable')
def upsertTable( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer):

Method to upsert rows from a new manifest into an existing table on synapse. For upsert functionality to work, primary keys must follow the naming convention of <component>_id. `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. Currently it is required to use -tcn "display label" with table upserts (see the sketch below).

Arguments:
  • dmge: DataModelGraphExplorer instance
Returns:

existingTableId: synID of the existing table that the manifest rows were upserted into
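
A sketch of an upsert, assuming a DataModelGraphExplorer has already been built from the project's data model (its construction is outside this class, so `data_model_graph` below is an assumed placeholder).

    from schematic.schemas.data_model_graph import DataModelGraphExplorer

    # Assumption: data_model_graph was built elsewhere from the project's
    # data model; DataModelGraphExplorer wraps that graph for schema lookups.
    dmge = DataModelGraphExplorer(data_model_graph)

    # The manifest's primary key must follow the <component>_id convention,
    # e.g. a "Patient" component keyed by a "patient_id" column.
    table_id = table_ops.upsertTable(dmge=dmge)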

@tracer.start_as_current_span('TableOperations::updateTable')
def updateTable(self, update_col: str = 'Id'):

Method to update an existing table with a new column

Arguments:
  • update_col: column to index the old and new tables on
Returns:

existingTableId: synID of the existing table that was updated
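
A sketch of appending new rows to an existing table by aligning on the shared `Id` column, assuming `extra_rows` is a manifest DataFrame; the synID is a placeholder.

    # Assumption: extra_rows is a manifest DataFrame that shares the "Id"
    # column with the existing table.
    update_ops = TableOperations(
        synStore=syn_store,
        tableToLoad=extra_rows,
        existingTableId="syn11111111",  # placeholder: table being updated
    )
    table_id = update_ops.updateTable(update_col="Id")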

class DatasetFileView:
3422class DatasetFileView:
3423    """Helper class to create temporary dataset file views.
3424    This class can be used in conjunction with a 'with' statement.
3425    This will ensure that the file view is deleted automatically.
3426    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3427    """
3428
3429    def __init__(
3430        self,
3431        datasetId: str,
3432        synapse: Synapse,
3433        name: str = None,
3434        temporary: bool = True,
3435        parentId: str = None,
3436    ) -> None:
3437        """Create a file view scoped to a dataset folder.
3438
3439        Args:
3440            datasetId (str): Synapse ID for a dataset folder/project.
3441            synapse (Synapse): Used for Synapse requests.
3442            name (str): Name of the file view (temporary or not).
3443            temporary (bool): Whether to delete the file view on exit
3444                of either a 'with' statement or Python entirely.
3445            parentId (str, optional): Synapse ID specifying where to
3446                store the file view. Defaults to datasetId.
3447        """
3448
3449        self.datasetId = datasetId
3450        self.synapse = synapse
3451        self.is_temporary = temporary
3452
3453        if name is None:
3454            name = f"schematic annotation file view for {self.datasetId}"
3455        self.name = name
3456        if self.is_temporary:
3457            uid = secrets.token_urlsafe(5)
3458            self.name = f"{self.name} - UID {uid}"
3459
3460        # TODO: Allow a DCC admin to configure a "universal parent"
3461        #       Such as a Synapse project writeable by everyone.
3462        self.parentId = datasetId if parentId is None else parentId
3463
3464        # TODO: Create local sharing setting to hide from everyone else
3465        view_schema = EntityViewSchema(
3466            name=self.name,
3467            parent=self.parentId,
3468            scopes=self.datasetId,
3469            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3470            addDefaultViewColumns=False,
3471            addAnnotationColumns=True,
3472        )
3473
3474        # TODO: Handle failure due to insufficient permissions by
3475        #       creating a temporary new project to store view
3476        self.view_schema = self.synapse.store(view_schema)
3477
3478        # These are filled in after calling `self.query()`
3479        self.results = None
3480        self.table = None
3481
3482        # Ensure deletion of the file view (last resort)
3483        if self.is_temporary:
3484            atexit.register(self.delete)
3485
3486    def __enter__(self):
3487        """Return file view when entering 'with' statement."""
3488        return self
3489
3490    def __exit__(self, exc_type, exc_value, traceback):
3491        """Delete file view when exiting 'with' statement."""
3492        if self.is_temporary:
3493            self.delete()
3494
3495    def delete(self):
3496        """Delete the file view on Synapse without deleting local table."""
3497        if self.view_schema is not None:
3498            self.synapse.delete(self.view_schema)
3499            self.view_schema = None
3500
3501    def query(self, tidy=True, force=False):
3502        """Retrieve file view as a data frame (raw format sans index)."""
3503        if self.table is None or force:
3504            fileview_id = self.view_schema["id"]
3505            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3506            self.table = self.results.asDataFrame(
3507                rowIdAndVersionInIndex=False,
3508                na_values=STR_NA_VALUES_FILTERED,
3509                keep_default_na=False,
3510            )
3511        if tidy:
3512            self.tidy_table()
3513        return self.table
3514
3515    def tidy_table(self):
3516        """Convert raw file view data frame into more usable format."""
3517        assert self.table is not None, "Must call `self.query()` first."
3518        self._fix_default_columns()
3519        self._fix_list_columns()
3520        self._fix_int_columns()
3521        return self.table
3522
3523    def _fix_default_columns(self):
3524        """Rename default columns to match schematic expectations."""
3525
3526        # Drop ROW_VERSION column if present
3527        if "ROW_VERSION" in self.table:
3528            del self.table["ROW_VERSION"]
3529
3530        # Rename id column to entityId and set as data frame index
3531        if "ROW_ID" in self.table:
3532            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3533            self.table = self.table.set_index("entityId", drop=False)
3534            del self.table["ROW_ID"]
3535
3536        # Rename ROW_ETAG column to eTag and place at end of data frame
3537        if "ROW_ETAG" in self.table:
3538            row_etags = self.table.pop("ROW_ETAG")
3539
3540            # eTag column may already be present if users annotated data without submitting a manifest
3541            # we're only concerned with the new values and not the existing ones
3542            if "eTag" in self.table:
3543                del self.table["eTag"]
3544
3545            self.table.insert(len(self.table.columns), "eTag", row_etags)
3546
3547        return self.table
3548
3549    def _get_columns_of_type(self, types):
3550        """Helper function to get list of columns of a given type(s)."""
3551        matching_columns = []
3552        for header in self.results.headers:
3553            if header.columnType in types:
3554                matching_columns.append(header.name)
3555        return matching_columns
3556
3557    def _fix_list_columns(self):
3558        """Fix formatting of list-columns."""
3559        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3560        list_columns = self._get_columns_of_type(list_types)
3561        for col in list_columns:
3562            self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
3563        return self.table
3564
3565    def _fix_int_columns(self):
3566        """Ensure that integer-columns are actually integers."""
3567        int_columns = self._get_columns_of_type({"INTEGER"})
3568        for col in int_columns:
3569            # Coercing to string because NaN is a floating point value
3570            # and cannot exist alongside integers in a column
3571            def to_int_fn(x):
3572                return "" if np.isnan(x) else str(int(x))
3573
3574            self.table[col] = self.table[col].apply(to_int_fn)
3575        return self.table

Helper class to create temporary dataset file views. This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. See SynapseStorage.getDatasetAnnotationsBatch for example usage.

DatasetFileView( datasetId: str, synapse: synapseclient.client.Synapse, name: str = None, temporary: bool = True, parentId: str = None)

Create a file view scoped to a dataset folder.

Arguments:
  • datasetId (str): Synapse ID for a dataset folder/project.
  • synapse (Synapse): Used for Synapse requests.
  • name (str): Name of the file view (temporary or not).
  • temporary (bool): Whether to delete the file view on exit of either a 'with' statement or Python entirely.
  • parentId (str, optional): Synapse ID specifying where to store the file view. Defaults to datasetId.
datasetId
synapse
is_temporary
parentId
view_schema
results
table
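
A minimal usage sketch: the 'with' block guarantees the temporary view is deleted even if the query fails. The synID and token are placeholders.

    from synapseclient import Synapse

    from schematic.store.synapse import DatasetFileView

    syn = Synapse()
    syn.login(authToken="<personal access token>")  # placeholder credentials

    # The view is created on entry and deleted automatically on exit.
    with DatasetFileView(datasetId="syn00000000", synapse=syn) as view:
        annotations = view.query(tidy=True)  # tidied pandas DataFrame

    print(annotations.head())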
def delete(self):

Delete the file view on Synapse without deleting local table.

def query(self, tidy=True, force=False):

Retrieve file view as a data frame (raw format sans index).
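
A short sketch of the caching behavior, continuing the `view` from the example above: the Synapse table query runs once and the resulting frame is reused until `force=True`.

    raw = view.query(tidy=False)        # runs the table query and caches the frame
    tidied = view.query()               # reuses the cached frame, then tidies it
    refreshed = view.query(force=True)  # re-runs the query against Synapse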

def tidy_table(self):

Convert raw file view data frame into more usable format.