schematic.store.synapse

Synapse storage class

View Source

   1"""Synapse storage class"""
   2
   3import asyncio
   4import atexit
   5import logging
   6import os
   7import re
   8import secrets
   9import shutil
  10import time
  11import uuid  # used to generate unique names for entities
  12from copy import deepcopy
  13from dataclasses import dataclass, field
  14from time import sleep
  15
  16# allows specifying explicit variable types
  17from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
  18
  19import numpy as np
  20import pandas as pd
  21import synapseclient
  22from opentelemetry import trace
  23from synapseclient import Annotations as OldAnnotations
  24from synapseclient import (
  25    Column,
  26    EntityViewSchema,
  27    EntityViewType,
  28    File,
  29    Folder,
  30    Schema,
  31    Synapse,
  32    Table,
  33    as_table_columns,
  34)
  35from synapseclient.annotations import _convert_to_annotations_list
  36from synapseclient.api import get_config_file, get_entity_id_bundle2
  37from synapseclient.core.constants.concrete_types import PROJECT_ENTITY
  38from synapseclient.core.exceptions import (
  39    SynapseAuthenticationError,
  40    SynapseHTTPError,
  41    SynapseUnmetAccessRestrictions,
  42)
  43from synapseclient.models.annotations import Annotations
  44from synapseclient.table import CsvFileTable, Schema, build_table
  45from tenacity import (
  46    retry,
  47    retry_if_exception_type,
  48    stop_after_attempt,
  49    wait_chain,
  50    wait_fixed,
  51)
  52
  53from schematic.configuration.configuration import CONFIG
  54from schematic.exceptions import AccessCredentialsError
  55from schematic.schemas.data_model_graph import DataModelGraphExplorer
  56from schematic.store.base import BaseStorage
  57from schematic.store.database.synapse_database import SynapseDatabase
  58from schematic.store.synapse_tracker import SynapseEntityTracker
  59from schematic.utils.df_utils import (
  60    STR_NA_VALUES_FILTERED,
  61    col_in_dataframe,
  62    load_df,
  63    update_df,
  64)
  65
  66# entity_type_mapping, get_dir_size, create_temp_folder, check_synapse_cache_size, and clear_synapse_cache functions are used for AWS deployment
  67# Please do not remove these import statements
  68from schematic.utils.general import (
  69    check_synapse_cache_size,
  70    clear_synapse_cache,
  71    create_temp_folder,
  72    entity_type_mapping,
  73    get_dir_size,
  74    create_like_statement,
  75)
  76from schematic.utils.io_utils import cleanup_temporary_storage
  77from schematic.utils.schema_utils import get_class_label_from_display_name
  78from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list
  79
  80
  81logger = logging.getLogger("Synapse storage")
  82
  83tracer = trace.get_tracer("Schematic")
  84
  85ID_COLUMN = "Id"
  86ENTITY_ID_COLUMN = "entityId"
  87UUID_COLUMN = "uuid"
  88
  89
  90@dataclass
  91class ManifestDownload(object):
  92    """
  93    syn: an object of type synapseclient.
  94    manifest_id: id of a manifest
  95    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
  96    """
  97
  98    syn: synapseclient.Synapse
  99    manifest_id: str
 100    synapse_entity_tracker: SynapseEntityTracker = field(
 101        default_factory=SynapseEntityTracker
 102    )
 103
 104    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
 105        """
 106        Try downloading a manifest to a specific folder (temporary or not). When the
 107        `use_temporary_folder` is set to True, the manifest will be downloaded to a
 108        temporary folder. This is useful for when the code is running as an API server
 109        where multiple requests are being made at the same time. This will prevent
 110        multiple requests from overwriting the same manifest file. When the
 111        `use_temporary_folder` is set to False, the manifest will be downloaded to the
 112        default manifest folder.
 113
 114        Args:
 115            use_temporary_folder: boolean argument indicating if a temporary folder
 116                should be used to store the manifest file. This is useful when running
 117                this code as an API server where multiple requests could be made at the
 118                same time. This is set to False when the code is being used from the
 119                CLI. Defaults to True.
 120
 121        Return:
 122            manifest_data: A Synapse file entity of the downloaded manifest
 123        """
 124        manifest_data = self.synapse_entity_tracker.get(
 125            synapse_id=self.manifest_id,
 126            syn=self.syn,
 127            download_file=False,
 128            retrieve_if_not_present=False,
 129        )
 130        current_span = trace.get_current_span()
 131        if (
 132            manifest_data
 133            and (file_handle := manifest_data.get("_file_handle", None))
 134            and current_span.is_recording()
 135        ):
 136            current_span.set_attribute(
 137                "schematic.manifest_size", file_handle.get("contentSize", 0)
 138            )
 139
 140        if manifest_data and manifest_data.path:
 141            return manifest_data
 142
 143        if "SECRETS_MANAGER_SECRETS" in os.environ:
 144            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 145            cleanup_temporary_storage(
 146                temporary_manifest_storage, time_delta_seconds=3600
 147            )
 148            # create a new directory to store manifest
 149            if not os.path.exists(temporary_manifest_storage):
 150                os.mkdir(temporary_manifest_storage)
 151            # create temporary folders for storing manifests
 152            download_location = create_temp_folder(
 153                path=temporary_manifest_storage,
 154                prefix=f"{self.manifest_id}-{time.time()}-",
 155            )
 156        else:
 157            if use_temporary_folder:
 158                download_location = create_temp_folder(
 159                    path=CONFIG.manifest_folder,
 160                    prefix=f"{self.manifest_id}-{time.time()}-",
 161                )
 162            else:
 163                download_location = CONFIG.manifest_folder
 164
 165        manifest_data = self.synapse_entity_tracker.get(
 166            synapse_id=self.manifest_id,
 167            syn=self.syn,
 168            download_file=True,
 169            retrieve_if_not_present=True,
 170            download_location=download_location,
 171        )
 172
 173        # This is doing a rename of the downloaded file. The reason this is important
 174        # is that if we are re-using a file that was previously downloaded, but the
 175        # file had been renamed. The file downloaded from the Synapse client is just
 176        # a direct copy of that renamed file. This code will set the name of the file
 177        # to the original name that was used to download the file. Note: An MD5 checksum
 178        # of the file will still be performed so if the file has changed, it will be
 179        # downloaded again.
 180        filename = manifest_data._file_handle.fileName
 181        if filename != os.path.basename(manifest_data.path):
 182            parent_folder = os.path.dirname(manifest_data.path)
 183            manifest_original_name_and_path = os.path.join(parent_folder, filename)
 184
 185            self.syn.cache.remove(
 186                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
 187            )
 188            os.rename(manifest_data.path, manifest_original_name_and_path)
 189            manifest_data.path = manifest_original_name_and_path
 190            self.syn.cache.add(
 191                file_handle_id=manifest_data.dataFileHandleId,
 192                path=manifest_original_name_and_path,
 193                md5=manifest_data._file_handle.contentMd5,
 194            )
 195
 196        return manifest_data
 197
 198    def _entity_type_checking(self) -> str:
 199        """
 200        check the entity type of the id that needs to be downloaded
 201        Return:
 202             if the entity type is wrong, raise an error
 203        """
 204        # check the type of entity
 205        entity_type = entity_type_mapping(
 206            syn=self.syn,
 207            entity_id=self.manifest_id,
 208            synapse_entity_tracker=self.synapse_entity_tracker,
 209        )
 210        if entity_type != "file":
 211            logger.error(
 212                f"You are using entity type: {entity_type}. Please provide a file ID"
 213            )
 214
 215    def download_manifest(
 216        self,
 217        newManifestName: str = "",
 218        manifest_df: pd.DataFrame = pd.DataFrame(),
 219        use_temporary_folder: bool = True,
 220    ) -> Union[str, File]:
 221        """
 222        Download a manifest based on a given manifest id.
 223        Args:
 224            newManifestName(optional): new name of a manifest that gets downloaded.
 225            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
 226        Return:
 227            manifest_data: synapse entity file object
 228        """
 229
 230        # enables retrying if user does not have access to uncensored manifest
 231        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
 232        manifest_data = ""
 233
 234        # check entity type
 235        self._entity_type_checking()
 236
 237        # download a manifest
 238        try:
 239            manifest_data = self._download_manifest_to_folder(
 240                use_temporary_folder=use_temporary_folder
 241            )
 242        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
 243            # if there's an error getting an uncensored manifest, try getting the censored manifest
 244            if not manifest_df.empty:
 245                censored_regex = re.compile(".*censored.*")
 246                censored = manifest_df["name"].str.contains(censored_regex)
 247                new_manifest_id = manifest_df[censored]["id"][0]
 248                self.manifest_id = new_manifest_id
 249                try:
 250                    manifest_data = self._download_manifest_to_folder(
 251                        use_temporary_folder=use_temporary_folder
 252                    )
 253                except (
 254                    SynapseUnmetAccessRestrictions,
 255                    SynapseAuthenticationError,
 256                ) as e:
 257                    raise PermissionError(
 258                        "You don't have access to censored and uncensored manifests in this dataset."
 259                    ) from e
 260            else:
 261                logger.error(
 262                    f"You don't have access to the requested resource: {self.manifest_id}"
 263                )
 264
 265        if newManifestName and os.path.exists(manifest_data.get("path")):
 266            # Rename the file we just made to the new name
 267            new_manifest_filename = newManifestName + ".csv"
 268
 269            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
 270            parent_folder = os.path.dirname(manifest_data.get("path"))
 271
 272            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
 273
 274            # Copy file to new location. The purpose of using a copy instead of a rename
 275            # is to avoid any potential issues with the file being used in another
 276            # process. This avoids any potential race or code cocurrency conditions.
 277            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
 278
 279            # Adding this to cache will allow us to re-use the already downloaded
 280            # manifest file for up to 1 hour.
 281            self.syn.cache.add(
 282                file_handle_id=manifest_data.dataFileHandleId,
 283                path=new_manifest_path_name,
 284                md5=manifest_data._file_handle.contentMd5,
 285            )
 286
 287            # Update file names/paths in manifest_data
 288            manifest_data["name"] = new_manifest_filename
 289            manifest_data["filename"] = new_manifest_filename
 290            manifest_data["path"] = new_manifest_path_name
 291
 292        return manifest_data
 293
 294
 295class SynapseStorage(BaseStorage):
 296    """Implementation of Storage interface for datasets/files stored on Synapse.
 297    Provides utilities to list files in a specific project; update files annotations, create fileviews, etc.
 298
 299    TODO: Need to define the interface and rename and/or refactor some of the methods below.
 300    """
 301
 302    @tracer.start_as_current_span("SynapseStorage::__init__")
 303    def __init__(
 304        self,
 305        token: Optional[str] = None,  # optional parameter retrieved from browser cookie
 306        access_token: Optional[str] = None,
 307        project_scope: Optional[list] = None,
 308        synapse_cache_path: Optional[str] = None,
 309        perform_query: Optional[bool] = True,
 310        columns: Optional[list] = None,
 311        where_clauses: Optional[list] = None,
 312    ) -> None:
 313        """Initializes a SynapseStorage object.
 314
 315        Args:
 316            token (Optional[str], optional):
 317              Optional token parameter as found in browser cookie upon login to synapse.
 318              Defaults to None.
 319            access_token (Optional[list], optional):
 320              Optional access token (personal or oauth).
 321              Defaults to None.
 322            project_scope (Optional[list], optional): Defaults to None.
 323            synapse_cache_path (Optional[str], optional):
 324              Location of synapse cache.
 325              Defaults to None.
 326        TODO:
 327            Consider necessity of adding "columns" and "where_clauses" params to the constructor. Currently with how `query_fileview` is implemented, these params are not needed at this step but could be useful in the future if the need for more scoped querys expands.
 328        """
 329        self.syn = self.login(synapse_cache_path, access_token)
 330        self.project_scope = project_scope
 331        self.storageFileview = CONFIG.synapse_master_fileview_id
 332        self.manifest = CONFIG.synapse_manifest_basename
 333        self.root_synapse_cache = self.syn.cache.cache_root_dir
 334        self.synapse_entity_tracker = SynapseEntityTracker()
 335        if perform_query:
 336            self.query_fileview(columns=columns, where_clauses=where_clauses)
 337
 338    # TODO: When moving this over to a regular cron-job the following logic should be
 339    # out of `manifest_download`:
 340    # if "SECRETS_MANAGER_SECRETS" in os.environ:
 341    #     temporary_manifest_storage = "/var/tmp/temp_manifest_download"
 342    #     cleanup_temporary_storage(temporary_manifest_storage, time_delta_seconds=3600)
 343    @tracer.start_as_current_span("SynapseStorage::_purge_synapse_cache")
 344    def _purge_synapse_cache(
 345        self, maximum_storage_allowed_cache_gb: int = 1, minute_buffer: int = 15
 346    ) -> None:
 347        """
 348        Purge synapse cache if it exceeds a certain size. Default to 1GB.
 349        Args:
 350            maximum_storage_allowed_cache_gb (int): the maximum storage allowed
 351              before purging cache. Default is 1 GB.
 352            minute_buffer (int): All files created this amount of time or older will be deleted
 353        """
 354        # try clearing the cache
 355        # scan a directory and check size of files
 356        if os.path.exists(self.root_synapse_cache):
 357            maximum_storage_allowed_cache_bytes = maximum_storage_allowed_cache_gb * (
 358                1024**3
 359            )
 360            nbytes = get_dir_size(self.root_synapse_cache)
 361            dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
 362            # if 1 GB has already been taken, purge cache before 15 min
 363            if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
 364                num_of_deleted_files = clear_synapse_cache(
 365                    self.syn.cache, minutes=minute_buffer
 366                )
 367                logger.info(
 368                    f"{num_of_deleted_files}  files have been deleted from {self.root_synapse_cache}"
 369                )
 370            else:
 371                # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB)
 372                # instead of guessing how much space that we left, print out .synapseCache here
 373                logger.info(f"the total size of .synapseCache is: {nbytes} bytes")
 374
 375    @tracer.start_as_current_span("SynapseStorage::query_fileview")
 376    def query_fileview(
 377        self,
 378        columns: Optional[list] = None,
 379        where_clauses: Optional[list] = None,
 380        force_requery: Optional[bool] = False,
 381    ) -> None:
 382        """
 383        Method to query the Synapse FileView and store the results in a pandas DataFrame. The results are stored in the storageFileviewTable attribute.
 384        Is called once during initialization of the SynapseStorage object and can be called again later to specify a specific, more limited scope for validation purposes.
 385        Args:
 386            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 387            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 388            force_requery (Optional[bool], optional): If True, forces a requery of the fileview. Defaults to False.
 389        """
 390        self._purge_synapse_cache()
 391
 392        # Initialize to assume that the new fileview query will be different from what may already be stored. Initializes to True because generally one will not have already been performed
 393        self.new_query_different = True
 394
 395        # If a query has already been performed, store the query
 396        previous_query_built = hasattr(self, "fileview_query")
 397        if previous_query_built:
 398            previous_query = self.fileview_query
 399
 400        # Build a query with the current given parameters and check to see if it is different from the previous
 401        self._build_query(columns=columns, where_clauses=where_clauses)
 402        if previous_query_built:
 403            self.new_query_different = self.fileview_query != previous_query
 404
 405        # Only perform the query if it is different from the previous query or we are forcing new results to be retrieved
 406        if self.new_query_different or force_requery:
 407            try:
 408                self.storageFileviewTable = self.syn.tableQuery(
 409                    query=self.fileview_query,
 410                ).asDataFrame(na_values=STR_NA_VALUES_FILTERED, keep_default_na=False)
 411            except SynapseHTTPError as exc:
 412                exception_text = str(exc)
 413                if "Unknown column path" in exception_text:
 414                    raise ValueError(
 415                        "The path column has not been added to the fileview. Please make sure that the fileview is up to date. You can add the path column to the fileview by follwing the instructions in the validation rules documentation."
 416                    )
 417                elif "Unknown column" in exception_text:
 418                    missing_column = exception_text.split("Unknown column ")[-1]
 419                    raise ValueError(
 420                        f"The columns {missing_column} specified in the query do not exist in the fileview. Please make sure that the column names are correct and that all expected columns have been added to the fileview."
 421                    )
 422                else:
 423                    raise AccessCredentialsError(self.storageFileview)
 424
 425    @staticmethod
 426    def build_clause_from_dataset_id(
 427        dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
 428    ) -> str:
 429        """
 430        Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
 431        Args:
 432            dataset_id: Synapse ID of a dataset that should be used to limit the query
 433            dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
 434        Returns:
 435            clause for the query or an empty string if no dataset ID is provided
 436        """
 437        # Calling this method without specifying synIDs will complete but will not scope the view
 438        if (not dataset_id) and (not dataset_folder_list):
 439            return ""
 440
 441        # This will be used to gather files under a dataset recursively with a fileview query instead of walking
 442        if dataset_folder_list:
 443            search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
 444            return f"parentId IN ({search_folders})"
 445
 446        # `dataset_id` should be provided when all files are stored directly under the dataset folder
 447        return f"parentId='{dataset_id}'"
 448
 449    def _build_query(
 450        self, columns: Optional[list] = None, where_clauses: Optional[list] = None
 451    ):
 452        """
 453        Method to build a query for Synapse FileViews
 454        Args:
 455            columns (Optional[list], optional): List of columns to be selected from the table. Defaults behavior is to request all columns.
 456            where_clauses (Optional[list], optional): List of where clauses to be used to scope the query. Defaults to None.
 457            self.storageFileview (str): Synapse FileView ID
 458            self.project_scope (Optional[list], optional): List of project IDs to be used to scope the query. Defaults to None.
 459                Gets added to where_clauses, more included for backwards compatability and as a more user friendly way of subsetting the view in a simple way.
 460        """
 461        if columns is None:
 462            columns = []
 463        if where_clauses is None:
 464            where_clauses = []
 465
 466        if self.project_scope:
 467            project_scope_clause = f"projectId IN {tuple(self.project_scope + [''])}"
 468            where_clauses.append(project_scope_clause)
 469
 470        if where_clauses:
 471            where_clauses = " AND ".join(where_clauses)
 472            where_clauses = f"WHERE {where_clauses} ;"
 473        else:
 474            where_clauses = ";"
 475
 476        if columns:
 477            columns = ",".join(columns)
 478        else:
 479            columns = "*"
 480
 481        self.fileview_query = (
 482            f"SELECT {columns} FROM {self.storageFileview} {where_clauses}"
 483        )
 484
 485        return
 486
 487    @staticmethod
 488    @tracer.start_as_current_span("SynapseStorage::login")
 489    def login(
 490        synapse_cache_path: Optional[str] = None,
 491        access_token: Optional[str] = None,
 492    ) -> synapseclient.Synapse:
 493        """Login to Synapse
 494
 495        Args:
 496            access_token (Optional[str], optional): A synapse access token. Defaults to None.
 497            synapse_cache_path (Optional[str]): location of synapse cache
 498
 499        Raises:
 500            ValueError: If unable to loging with access token
 501
 502        Returns:
 503            synapseclient.Synapse: A Synapse object that is logged in
 504        """
 505        if not access_token:
 506            access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
 507
 508        # login using a token
 509        if access_token:
 510            try:
 511                syn = synapseclient.Synapse(
 512                    cache_root_dir=synapse_cache_path,
 513                    debug=False,
 514                    skip_checks=True,
 515                    cache_client=False,
 516                )
 517                syn.login(authToken=access_token, silent=True)
 518            except SynapseHTTPError as exc:
 519                raise ValueError(
 520                    "No access to resources. Please make sure that your token is correct"
 521                ) from exc
 522        else:
 523            # login using synapse credentials provided by user in .synapseConfig (default) file
 524            syn = synapseclient.Synapse(
 525                configPath=CONFIG.synapse_configuration_path,
 526                cache_root_dir=synapse_cache_path,
 527                debug=False,
 528                skip_checks=True,
 529                cache_client=False,
 530            )
 531            syn.login(silent=True)
 532
 533        # set user id attribute
 534        current_span = trace.get_current_span()
 535        if current_span.is_recording():
 536            current_span.set_attribute("user.id", syn.credentials.owner_id)
 537
 538        return syn
 539
 540    def missing_entity_handler(method):
 541        def wrapper(*args, **kwargs):
 542            try:
 543                return method(*args, **kwargs)
 544            except SynapseHTTPError as ex:
 545                str_message = str(ex).replace("\n", "")
 546                if "trash" in str_message or "does not exist" in str_message:
 547                    logging.warning(str_message)
 548                    return None
 549                else:
 550                    raise ex
 551
 552        return wrapper
 553
 554    def async_missing_entity_handler(method):
 555        """Decorator to handle missing entities in async methods."""
 556
 557        async def wrapper(*args: Any, **kwargs: Any) -> Any:
 558            try:
 559                return await method(*args, **kwargs)
 560            except SynapseHTTPError as ex:
 561                str_message = str(ex).replace("\n", "")
 562                if "trash" in str_message or "does not exist" in str_message:
 563                    logging.warning(str_message)
 564                    return None
 565                else:
 566                    raise ex
 567
 568        return wrapper
 569
 570    def getStorageFileviewTable(self):
 571        """Returns the storageFileviewTable obtained during initialization."""
 572        return self.storageFileviewTable
 573
 574    def getPaginatedRestResults(self, currentUserId: str) -> Dict[str, str]:
 575        """Gets the paginated results of the REST call to Synapse to check what projects the current user has access to.
 576
 577        Args:
 578            currentUserId: synapse id for the user whose projects we want to get.
 579
 580        Returns:
 581            A dictionary with a next page token and the results.
 582        """
 583        all_results = self.syn.restGET(
 584            "/projects/user/{principalId}".format(principalId=currentUserId)
 585        )
 586
 587        while (
 588            "nextPageToken" in all_results
 589        ):  # iterate over next page token in results while there is any
 590            results_token = self.syn.restGET(
 591                "/projects/user/{principalId}?nextPageToken={nextPageToken}".format(
 592                    principalId=currentUserId,
 593                    nextPageToken=all_results["nextPageToken"],
 594                )
 595            )
 596            all_results["results"].extend(results_token["results"])
 597
 598            if "nextPageToken" in results_token:
 599                all_results["nextPageToken"] = results_token["nextPageToken"]
 600            else:
 601                del all_results["nextPageToken"]
 602
 603        return all_results
 604
 605    @tracer.start_as_current_span("SynapseStorage::getStorageProjects")
 606    def getStorageProjects(self, project_scope: List = None) -> list[tuple[str, str]]:
 607        """Gets all storage projects the current user has access to, within the scope of the 'storageFileview' attribute.
 608
 609        Returns:
 610            A list of storage projects the current user has access to; the list consists of tuples (projectId, projectName).
 611        """
 612
 613        # get the set of all storage Synapse project accessible for this pipeline
 614        storageProjects = self.storageFileviewTable["projectId"].unique()
 615
 616        # get the set of storage Synapse project accessible for this user
 617        # get a list of projects from Synapse
 618        current_user_project_headers = self.synapse_entity_tracker.get_project_headers(
 619            current_user_id=self.syn.credentials.owner_id, syn=self.syn
 620        )
 621        project_id_to_name_dict = {}
 622        current_user_projects = []
 623        for project_header in current_user_project_headers:
 624            project_id_to_name_dict[project_header.get("id")] = project_header.get(
 625                "name"
 626            )
 627            current_user_projects.append(project_header.get("id"))
 628
 629        # find set of user projects that are also in this pipeline's storage projects set
 630        storageProjects = list(set(storageProjects) & set(current_user_projects))
 631
 632        # Limit projects to scope if specified
 633        if project_scope:
 634            storageProjects = list(set(storageProjects) & set(project_scope))
 635
 636            if not storageProjects:
 637                raise Warning(
 638                    f"There are no projects that the user has access to that match the criteria of the specified project scope: {project_scope}"
 639                )
 640
 641        # prepare a return list of project IDs and names
 642        projects = []
 643        for projectId in storageProjects:
 644            project_name_from_project_header = project_id_to_name_dict.get(projectId)
 645            projects.append((projectId, project_name_from_project_header))
 646
 647        sorted_projects_list = sorted(projects, key=lambda tup: tup[0])
 648
 649        return sorted_projects_list
 650
 651    @tracer.start_as_current_span("SynapseStorage::getStorageDatasetsInProject")
 652    def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
 653        """Gets all datasets in folder under a given storage project that the current user has access to.
 654
 655        Args:
 656            projectId: synapse ID of a storage project.
 657
 658        Returns:
 659            A list of datasets within the given storage project; the list consists of tuples (datasetId, datasetName).
 660            None: If the projectId cannot be found on Synapse.
 661        """
 662
 663        # select all folders and fetch their names from within the storage project;
 664        # if folder content type is defined, only select folders that contain datasets
 665        if "contentType" in self.storageFileviewTable.columns:
 666            foldersTable = self.storageFileviewTable[
 667                (self.storageFileviewTable["contentType"] == "dataset")
 668                & (self.storageFileviewTable["projectId"] == projectId)
 669            ]
 670        else:
 671            foldersTable = self.storageFileviewTable[
 672                (self.storageFileviewTable["type"] == "folder")
 673                & (self.storageFileviewTable["parentId"] == projectId)
 674            ]
 675
 676        # get an array of tuples (folderId, folderName)
 677        # some folders are part of datasets; others contain datasets
 678        # each dataset parent is the project; folders part of a dataset have another folder as a parent
 679        # to get folders if and only if they contain datasets for each folder
 680        # check if folder's parent is the project; if so that folder contains a dataset,
 681        # unless the folder list has already been filtered to dataset folders based on contentType attribute above
 682
 683        datasetList = []
 684        folderProperties = ["id", "name"]
 685        for folder in list(
 686            foldersTable[folderProperties].itertuples(index=False, name=None)
 687        ):
 688            datasetList.append(folder)
 689
 690        sorted_dataset_list = sorted(datasetList, key=lambda tup: tup[0])
 691
 692        return sorted_dataset_list
 693
 694    @tracer.start_as_current_span("SynapseStorage::getFilesInStorageDataset")
 695    def getFilesInStorageDataset(
 696        self, datasetId: str, fileNames: List = None, fullpath: bool = True
 697    ) -> List[Tuple[str, str]]:
 698        """Gets all files (excluding manifest files) in a given dataset folder.
 699
 700        Args:
 701            datasetId: synapse ID of a storage dataset.
 702            fileNames: get a list of files with particular names; defaults to None in which case all dataset files are returned (except bookkeeping files, e.g.
 703            metadata manifests); if fileNames is not None, all files matching the names in the fileNames list are returned if present.
 704            fullpath: if True return the full path as part of this filename; otherwise return just base filename
 705
 706        Returns:
 707            A list of files; the list consists of tuples (fileId, fileName).
 708
 709        Raises:
 710            ValueError: Dataset ID not found.
 711        """
 712        file_list = []
 713
 714        # Get path to dataset folder by using childern to avoid cases where the dataset is the scope of the view
 715        if self.storageFileviewTable.empty:
 716            raise ValueError(
 717                f"Fileview {self.storageFileview} is empty, please check the table and the provided synID and try again."
 718            )
 719        child_path = self.storageFileviewTable.loc[
 720            self.storageFileviewTable["parentId"] == datasetId, "path"
 721        ]
 722        if child_path.empty:
 723            raise LookupError(
 724                f"Dataset {datasetId} could not be found in fileview {self.storageFileview}."
 725            )
 726        child_path = child_path.iloc[0]
 727
 728        # Get the dataset path by eliminating the child's portion of the path to account for nested datasets
 729        parent = child_path.split("/")[:-1]
 730        parent = "/".join(parent)
 731
 732        # When querying, only include files to exclude entity files and subdirectories
 733        where_clauses = [create_like_statement(parent), "type='file'"]
 734
 735        # Requery the fileview to specifically get the files in the given dataset
 736        self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)
 737
 738        # Exclude manifest files
 739        non_manifest_files = self.storageFileviewTable.loc[
 740            ~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
 741            :,
 742        ]
 743
 744        # Remove all files that are not in the list of fileNames
 745        if fileNames:
 746            filename_regex = "|".join(fileNames)
 747
 748            matching_files = non_manifest_files["path"].str.contains(
 749                filename_regex, case=False, regex=True
 750            )
 751
 752            non_manifest_files = non_manifest_files.loc[matching_files, :]
 753
 754        # Truncate path if necessary
 755        if not fullpath:
 756            non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)
 757
 758        # Return list of files as expected by other methods
 759        file_list = list(non_manifest_files.itertuples(index=False, name=None))
 760
 761        return file_list
 762
 763    def _get_manifest_id(self, manifest: pd.DataFrame) -> str:
 764        """If both censored and uncensored manifests are present, return uncensored manifest; if only one manifest is present, return manifest id of that manifest; if more than two manifests are present, return the manifest id of the first one.
 765        Args:
 766        manifest: a dataframe contains name and id of manifests in a given asset view
 767
 768        Return:
 769        manifest_syn_id: id of a given censored or uncensored manifest
 770        """
 771        censored_regex = re.compile(".*censored.*")
 772        censored = manifest["name"].str.contains(censored_regex)
 773        if any(censored):
 774            # Try to use uncensored manifest first
 775            not_censored = ~censored
 776            if any(not_censored):
 777                manifest_syn_id = manifest[not_censored]["id"].iloc[0]
 778            # if only censored manifests are available, just use the first censored manifest
 779            else:
 780                manifest_syn_id = manifest["id"].iloc[0]
 781
 782        # otherwise, use the first (implied only) version that exists
 783        else:
 784            manifest_syn_id = manifest["id"].iloc[0]
 785
 786        return manifest_syn_id
 787
 788    @tracer.start_as_current_span("SynapseStorage::getDatasetManifest")
 789    def getDatasetManifest(
 790        self,
 791        datasetId: str,
 792        downloadFile: bool = False,
 793        newManifestName: str = "",
 794        use_temporary_folder: bool = True,
 795    ) -> Union[str, File]:
 796        """Gets the manifest associated with a given dataset.
 797
 798        Args:
 799            datasetId: synapse ID of a storage dataset.
 800            downloadFile: boolean argument indicating if manifest file in dataset should be downloaded or not.
 801            newManifestName: new name of a manifest that gets downloaded
 802            use_temporary_folder: boolean argument indicating if a temporary folder
 803                should be used to store the manifest file. This is useful when running
 804                this code as an API server where multiple requests could be made at the
 805                same time. This is set to False when the code is being used from the
 806                CLI. Defaults to True.
 807
 808        Returns:
 809            manifest_syn_id (String): Synapse ID of exisiting manifest file.
 810            manifest_data (synapseclient.entity.File): Synapse entity if downloadFile is True.
 811            "" (String): No pre-exisiting manifest in dataset.
 812        """
 813        manifest_data = ""
 814
 815        # get a list of files containing the manifest for this dataset (if any)
 816        all_files = self.storageFileviewTable
 817
 818        # construct regex based on manifest basename in the config
 819        manifest_re = re.compile(os.path.basename(self.manifest) + ".*.[tc]sv")
 820
 821        # search manifest based on given manifest basename regex above
 822        # and return a dataframe containing name and id of manifests in a given asset view
 823        manifest = all_files[
 824            (all_files["name"].str.contains(manifest_re, regex=True))
 825            & (all_files["parentId"] == datasetId)
 826        ]
 827
 828        manifest = manifest[["id", "name"]]
 829
 830        # if there is no pre-exisiting manifest in the specified dataset
 831        if manifest.empty:
 832            logger.warning(
 833                f"Could not find a manifest that fits basename {self.manifest} in asset view and dataset {datasetId}"
 834            )
 835            return ""
 836
 837        # if there is an exisiting manifest
 838        else:
 839            manifest_syn_id = self._get_manifest_id(manifest)
 840            if downloadFile:
 841                md = ManifestDownload(
 842                    self.syn,
 843                    manifest_id=manifest_syn_id,
 844                    synapse_entity_tracker=self.synapse_entity_tracker,
 845                )
 846                manifest_data = md.download_manifest(
 847                    newManifestName=newManifestName,
 848                    manifest_df=manifest,
 849                    use_temporary_folder=use_temporary_folder,
 850                )
 851                # TO DO: revisit how downstream code handle manifest_data. If the downstream code would break when manifest_data is an empty string,
 852                # then we should catch the error here without returning an empty string.
 853                if not manifest_data:
 854                    logger.debug(
 855                        f"No manifest data returned. Please check if you have successfully downloaded manifest: {manifest_syn_id}"
 856                    )
 857                return manifest_data
 858            return manifest_syn_id
 859
 860    def getDataTypeFromManifest(self, manifestId: str):
 861        """Fetch a manifest and return data types of all columns
 862        Args:
 863            manifestId: synapse ID of a manifest
 864        """
 865        # get manifest file path
 866        manifest_entity = self.synapse_entity_tracker.get(
 867            synapse_id=manifestId, syn=self.syn, download_file=True
 868        )
 869        manifest_filepath = manifest_entity.path
 870
 871        # load manifest dataframe
 872        manifest = load_df(
 873            manifest_filepath,
 874            preserve_raw_input=False,
 875            data_model=False,
 876        )
 877
 878        # convert the dataFrame to use best possible dtypes.
 879        manifest_new = manifest.convert_dtypes()
 880
 881        # get data types of columns
 882        result = manifest_new.dtypes.to_frame("dtypes").reset_index()
 883
 884        # return the result as a dictionary
 885        result_dict = result.set_index("index")["dtypes"].astype(str).to_dict()
 886
 887        return result_dict
 888
 889    def _get_files_metadata_from_dataset(
 890        self, datasetId: str, only_new_files: bool, manifest: pd.DataFrame = None
 891    ) -> Optional[dict]:
 892        """retrieve file ids under a particular datasetId
 893
 894        Args:
 895            datasetId (str): a dataset id
 896            only_new_files (bool): if only adding new files that are not already exist
 897            manifest (pd.DataFrame): metadata manifest dataframe. Default to None.
 898
 899        Returns:
 900            a dictionary that contains filename and entityid under a given datasetId or None if there is nothing under a given dataset id are not available
 901        """
 902        dataset_files = self.getFilesInStorageDataset(datasetId)
 903        if dataset_files:
 904            dataset_file_names_id_dict = self._get_file_entityIds(
 905                dataset_files, only_new_files=only_new_files, manifest=manifest
 906            )
 907            return dataset_file_names_id_dict
 908        else:
 909            return None
 910
 911    def add_entity_id_and_filename(
 912        self, datasetId: str, manifest: pd.DataFrame
 913    ) -> pd.DataFrame:
 914        """add entityid and filename column to an existing manifest assuming entityId column is not already present
 915
 916        Args:
 917            datasetId (str): dataset syn id
 918            manifest (pd.DataFrame): existing manifest dataframe, assuming this dataframe does not have an entityId column and Filename column is present but completely empty
 919
 920        Returns:
 921            pd.DataFrame: returns a pandas dataframe
 922        """
 923        # get file names and entity ids of a given dataset
 924        dataset_files_dict = self._get_files_metadata_from_dataset(
 925            datasetId, only_new_files=False
 926        )
 927
 928        if dataset_files_dict:
 929            # turn manifest dataframe back to a dictionary for operation
 930            manifest_dict = manifest.to_dict("list")
 931
 932            # update Filename column
 933            # add entityId column to the end
 934            manifest_dict.update(dataset_files_dict)
 935
 936            # if the component column exists in existing manifest, fill up that column
 937            if "Component" in manifest_dict.keys():
 938                manifest_dict["Component"] = manifest_dict["Component"] * max(
 939                    1, len(manifest_dict["Filename"])
 940                )
 941
 942            # turn dictionary back to a dataframe
 943            manifest_df_index = pd.DataFrame.from_dict(manifest_dict, orient="index")
 944            manifest_df_updated = manifest_df_index.transpose()
 945
 946            # fill na with empty string
 947            manifest_df_updated = manifest_df_updated.fillna("")
 948
 949            # drop index
 950            manifest_df_updated = manifest_df_updated.reset_index(drop=True)
 951
 952            return manifest_df_updated
 953        else:
 954            return manifest
 955
 956    def fill_in_entity_id_filename(
 957        self, datasetId: str, manifest: pd.DataFrame
 958    ) -> Tuple[List, pd.DataFrame]:
 959        """fill in Filename column and EntityId column. EntityId column and Filename column will be created if not already present.
 960
 961        Args:
 962            datasetId (str): dataset syn id
 963            manifest (pd.DataFrame): existing manifest dataframe.
 964
 965        Returns:
 966            Tuple[List, pd.DataFrame]: a list of synIds that are under a given datasetId folder and updated manifest dataframe
 967        """
 968        # get dataset file names and entity id as a list of tuple
 969        dataset_files = self.getFilesInStorageDataset(datasetId)
 970
 971        # update manifest with additional filenames, if any
 972        # note that if there is an existing manifest and there are files in the dataset
 973        # the columns Filename and entityId are assumed to be present in manifest schema
 974        # TODO: use idiomatic panda syntax
 975        if not dataset_files:
 976            manifest = manifest.fillna("")
 977            return dataset_files, manifest
 978
 979        all_files = self._get_file_entityIds(
 980            dataset_files=dataset_files, only_new_files=False, manifest=manifest
 981        )
 982        new_files = self._get_file_entityIds(
 983            dataset_files=dataset_files, only_new_files=True, manifest=manifest
 984        )
 985
 986        all_files = pd.DataFrame(all_files)
 987        new_files = pd.DataFrame(new_files)
 988
 989        # update manifest so that it contains new dataset files
 990        manifest = (
 991            pd.concat([manifest, new_files], sort=False)
 992            .reset_index()
 993            .drop("index", axis=1)
 994        )
 995
 996        # Reindex manifest and new files dataframes according to entityIds to align file paths and metadata
 997        manifest_reindex = manifest.set_index("entityId")
 998        all_files_reindex = all_files.set_index("entityId")
 999        all_files_reindex_like_manifest = all_files_reindex.reindex_like(
1000            manifest_reindex
1001        )
1002
1003        # Check if individual file paths in manifest and from synapse match
1004        file_paths_match = (
1005            manifest_reindex["Filename"] == all_files_reindex_like_manifest["Filename"]
1006        )
1007
1008        # If all the paths do not match, update the manifest with the filepaths from synapse
1009        if not file_paths_match.all():
1010            manifest_reindex.loc[
1011                ~file_paths_match, "Filename"
1012            ] = all_files_reindex_like_manifest.loc[~file_paths_match, "Filename"]
1013
1014            # reformat manifest for further use
1015            manifest = manifest_reindex.reset_index()
1016            entityIdCol = manifest.pop("entityId")
1017            manifest.insert(len(manifest.columns), "entityId", entityIdCol)
1018
1019        manifest = manifest.fillna("")
1020        return dataset_files, manifest
1021
1022    @tracer.start_as_current_span("SynapseStorage::updateDatasetManifestFiles")
1023    def updateDatasetManifestFiles(
1024        self, dmge: DataModelGraphExplorer, datasetId: str, store: bool = True
1025    ) -> Union[Tuple[str, pd.DataFrame], None]:
1026        """Fetch the names and entity IDs of all current files in dataset in store, if any; update dataset's manifest with new files, if any.
1027
1028        Args:
1029            dmge: DataModelGraphExplorer Instance
1030            datasetId: synapse ID of a storage dataset.
1031            store: if set to True store updated manifest in asset store; if set to False
1032            return a Pandas dataframe containing updated manifest but do not store to asset store
1033
1034
1035        Returns:
1036            Synapse ID of updated manifest and Pandas dataframe containing the updated manifest.
1037            If there is no existing manifest or if the manifest does not have an entityId column, return None
1038        """
1039
1040        # get existing manifest Synapse ID
1041        manifest_id = self.getDatasetManifest(datasetId)
1042
1043        # if there is no manifest return None
1044        if not manifest_id:
1045            return None
1046
1047        manifest_entity = self.synapse_entity_tracker.get(
1048            synapse_id=manifest_id, syn=self.syn, download_file=True
1049        )
1050        manifest_filepath = manifest_entity.path
1051        manifest = load_df(manifest_filepath)
1052
1053        # If the manifest does not have an entityId column, trigger a new manifest to be generated
1054        if "entityId" not in manifest.columns:
1055            return None
1056
1057        manifest_is_file_based = "Filename" in manifest.columns
1058
1059        if manifest_is_file_based:
1060            # update manifest with additional filenames, if any
1061            # note that if there is an existing manifest and there are files in the dataset
1062            # the columns Filename and entityId are assumed to be present in manifest schema
1063            # TODO: use idiomatic panda syntax
1064            dataset_files, manifest = self.fill_in_entity_id_filename(
1065                datasetId, manifest
1066            )
1067            if dataset_files:
1068                # update the manifest file, so that it contains the relevant entity IDs
1069                if store:
1070                    manifest.to_csv(manifest_filepath, index=False)
1071
1072                    # store manifest and update associated metadata with manifest on Synapse
1073                    manifest_id = self.associateMetadataWithFiles(
1074                        dmge, manifest_filepath, datasetId
1075                    )
1076
1077        return manifest_id, manifest
1078
1079    def _get_file_entityIds(
1080        self,
1081        dataset_files: List,
1082        only_new_files: bool = False,
1083        manifest: pd.DataFrame = None,
1084    ):
1085        """
1086        Get a dictionary of files in a dataset. Either files that are not in the current manifest or all files
1087
1088        Args:
1089            manifest: metadata manifest
1090            dataset_file: List of all files in a dataset
1091            only_new_files: boolean to control whether only new files are returned or all files in the dataset
1092        Returns:
1093            files: dictionary of file names and entityIDs, with scope as specified by `only_new_files`
1094        """
1095        files = {"Filename": [], "entityId": []}
1096
1097        if only_new_files:
1098            if manifest is None:
1099                raise UnboundLocalError(
1100                    "No manifest was passed in, a manifest is required when `only_new_files` is True."
1101                )
1102
1103            if "entityId" not in manifest.columns:
1104                raise ValueError(
1105                    "The manifest in your dataset and/or top level folder must contain the 'entityId' column. "
1106                    "Please generate an empty manifest without annotations, manually add annotations to the "
1107                    "appropriate files in the manifest, and then try again."
1108                )
1109
1110            # find new files (that are not in the current manifest) if any
1111            for file_id, file_name in dataset_files:
1112                if not file_id in manifest["entityId"].values:
1113                    files["Filename"].append(file_name)
1114                    files["entityId"].append(file_id)
1115        else:
1116            # get all files
1117            for file_id, file_name in dataset_files:
1118                files["Filename"].append(file_name)
1119                files["entityId"].append(file_id)
1120
1121        return files
1122
1123    @tracer.start_as_current_span("SynapseStorage::getProjectManifests")
1124    def getProjectManifests(
1125        self, projectId: str
1126    ) -> list[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]:
1127        """Gets all metadata manifest files across all datasets in a specified project.
1128
1129        Returns: A list of datasets per project; metadata manifest Synapse ID for each dataset; and the corresponding schema component of the manifest
1130                 as a list of tuples, one for each manifest:
1131                    [
1132                        (
1133                            (datasetId, dataName),
1134                            (manifestId, manifestName),
1135                            (componentSchemaLabel, componentSchemaLabel) TODO: # get component name from schema
1136                        ),
1137                        ...
1138                    ]
1139
1140        TODO: Return manifest URI instead of Synapse ID for interoperability with other implementations of a store interface
1141        """
1142        component = None
1143        entity = None
1144        manifests = []
1145
1146        datasets = self.getStorageDatasetsInProject(projectId)
1147
1148        for datasetId, datasetName in datasets:
1149            # encode information about the manifest in a simple list (so that R clients can unpack it)
1150            # eventually can serialize differently
1151
1152            # Get synID of manifest for a dataset
1153            manifestId = self.getDatasetManifest(datasetId)
1154
1155            # If a manifest exists, get the annotations for it, else return base 'manifest' tuple
1156            if manifestId:
1157                annotations = self.getFileAnnotations(manifestId)
1158
1159                # If manifest has annotations specifying component, use that
1160                if annotations and "Component" in annotations:
1161                    component = annotations["Component"]
1162                    entity = self.synapse_entity_tracker.get(
1163                        synapse_id=manifestId, syn=self.syn, download_file=False
1164                    )
1165                    manifest_name = entity["properties"]["name"]
1166
1167                # otherwise download the manifest and parse for information
1168                elif not annotations or "Component" not in annotations:
1169                    logging.debug(
1170                        f"No component annotations have been found for manifest {manifestId}. "
1171                        "The manifest will be downloaded and parsed instead. "
1172                        "For increased speed, add component annotations to manifest."
1173                    )
1174
1175                    manifest_info = self.getDatasetManifest(
1176                        datasetId, downloadFile=True
1177                    )
1178                    manifest_name = manifest_info["properties"].get("name", "")
1179
1180                    if not manifest_name:
1181                        logger.error(f"Failed to download manifests from {datasetId}")
1182
1183                    manifest_path = manifest_info["path"]
1184
1185                    manifest_df = load_df(manifest_path)
1186
1187                    # Get component from component column if it exists
1188                    if (
1189                        "Component" in manifest_df
1190                        and not manifest_df["Component"].empty
1191                    ):
1192                        list(set(manifest_df["Component"]))
1193                        component = list(set(manifest_df["Component"]))
1194
1195                        # Added to address issues raised during DCA testing
1196                        if "" in component:
1197                            component.remove("")
1198
1199                        if len(component) == 1:
1200                            component = component[0]
1201                        elif len(component) > 1:
1202                            logging.warning(
1203                                f"Manifest {manifestId} is composed of multiple components. Schematic does not support mulit-component manifests at this time."
1204                                "Behavior of manifests with multiple components is undefined"
1205                            )
1206            else:
1207                manifest_name = ""
1208                component = None
1209            if component:
1210                manifest = (
1211                    (datasetId, datasetName),
1212                    (manifestId, manifest_name),
1213                    (component, component),
1214                )
1215            elif manifestId:
1216                logging.debug(
1217                    f"Manifest {manifestId} does not have an associated Component"
1218                )
1219                manifest = (
1220                    (datasetId, datasetName),
1221                    (manifestId, manifest_name),
1222                    ("", ""),
1223                )
1224            else:
1225                manifest = (
1226                    (datasetId, datasetName),
1227                    ("", ""),
1228                    ("", ""),
1229                )
1230
1231            if manifest:
1232                manifests.append(manifest)
1233
1234        return manifests
1235
1236    def upload_project_manifests_to_synapse(
1237        self, dmge: DataModelGraphExplorer, projectId: str
1238    ) -> List[str]:
1239        """Upload all metadata manifest files across all datasets in a specified project as tables in Synapse.
1240
1241        Returns: String of all the manifest_table_ids of all the manifests that have been loaded.
1242        """
1243
1244        manifests = []
1245        manifest_loaded = []
1246        datasets = self.getStorageDatasetsInProject(projectId)
1247
1248        for datasetId, datasetName in datasets:
1249            # encode information about the manifest in a simple list (so that R clients can unpack it)
1250            # eventually can serialize differently
1251
1252            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1253
1254            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1255            if manifest_info:
1256                manifest_id = manifest_info["properties"]["id"]
1257                manifest_name = manifest_info["properties"]["name"]
1258                manifest_path = manifest_info["path"]
1259                manifest_df = load_df(manifest_path)
1260                manifest_table_id = uploadDB(
1261                    dmge=dmge,
1262                    manifest=manifest,
1263                    datasetId=datasetId,
1264                    table_name=datasetName,
1265                )
1266                manifest_loaded.append(datasetName)
1267        return manifest_loaded
1268
1269    def upload_annotated_project_manifests_to_synapse(
1270        self, projectId: str, path_to_json_ld: str, dry_run: bool = False
1271    ) -> List[str]:
1272        """
1273        Purpose:
1274            For all manifests in a project, upload them as a table and add annotations manifest csv.
1275            Assumes the manifest is already present as a CSV in a dataset in the project.
1276
1277        """
1278        # Instantiate DataModelParser
1279        data_model_parser = DataModelParser(path_to_data_model=path_to_json_ld)
1280        # Parse Model
1281        parsed_data_model = data_model_parser.parse_model()
1282
1283        # Instantiate DataModelGraph
1284        data_model_grapher = DataModelGraph(parsed_data_model)
1285
1286        # Generate graph
1287        graph_data_model = data_model_grapher.generate_data_model_graph()
1288
1289        # Instantiate DataModelGraphExplorer
1290        dmge = DataModelGraphExplorer(graph_data_model)
1291
1292        manifests = []
1293        manifest_loaded = []
1294        datasets = self.getStorageDatasetsInProject(projectId)
1295        for datasetId, datasetName in datasets:
1296            # encode information about the manifest in a simple list (so that R clients can unpack it)
1297            # eventually can serialize differently
1298
1299            manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1300            manifests.append(manifest)
1301
1302            manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1303
1304            if manifest_info:
1305                manifest_id = manifest_info["properties"]["id"]
1306                manifest_name = manifest_info["properties"]["name"]
1307                manifest_path = manifest_info["path"]
1308                manifest = (
1309                    (datasetId, datasetName),
1310                    (manifest_id, manifest_name),
1311                    ("", ""),
1312                )
1313                if not dry_run:
1314                    self.associateMetadataWithFiles(
1315                        dmge, manifest_path, datasetId, manifest_record_type="table"
1316                    )
1317                manifest_loaded.append(manifest)
1318
1319        return manifests, manifest_loaded
1320
1321    def move_entities_to_new_project(
1322        self,
1323        projectId: str,
1324        newProjectId: str,
1325        returnEntities: bool = False,
1326        dry_run: bool = False,
1327    ):
1328        """
1329        For each manifest csv in a project, look for all the entitiy ids that are associated.
1330        Look up the entitiy in the files, move the entity to new project.
1331        """
1332
1333        manifests = []
1334        manifest_loaded = []
1335        datasets = self.getStorageDatasetsInProject(projectId)
1336        if datasets:
1337            for datasetId, datasetName in datasets:
1338                # encode information about the manifest in a simple list (so that R clients can unpack it)
1339                # eventually can serialize differently
1340
1341                manifest = ((datasetId, datasetName), ("", ""), ("", ""))
1342                manifests.append(manifest)
1343
1344                manifest_info = self.getDatasetManifest(datasetId, downloadFile=True)
1345                if manifest_info:
1346                    manifest_id = manifest_info["properties"]["id"]
1347                    manifest_name = manifest_info["properties"]["name"]
1348                    manifest_path = manifest_info["path"]
1349                    manifest_df = load_df(manifest_path)
1350
1351                    manifest = (
1352                        (datasetId, datasetName),
1353                        (manifest_id, manifest_name),
1354                        ("", ""),
1355                    )
1356                    manifest_loaded.append(manifest)
1357
1358                    annotation_entities = self.storageFileviewTable[
1359                        (self.storageFileviewTable["id"].isin(manifest_df["entityId"]))
1360                        & (self.storageFileviewTable["type"] == "folder")
1361                    ]["id"]
1362
1363                    if returnEntities:
1364                        for entityId in annotation_entities:
1365                            if not dry_run:
1366                                moved_entity = self.syn.move(entityId, datasetId)
1367                                self.synapse_entity_tracker.add(
1368                                    synapse_id=moved_entity.id, entity=moved_entity
1369                                )
1370                            else:
1371                                logging.info(
1372                                    f"{entityId} will be moved to folder {datasetId}."
1373                                )
1374                    else:
1375                        # generate project folder
1376                        archive_project_folder = Folder(
1377                            projectId + "_archive", parent=newProjectId
1378                        )
1379                        archive_project_folder = self.syn.store(archive_project_folder)
1380                        self.synapse_entity_tracker.add(
1381                            synapse_id=archive_project_folder.id,
1382                            entity=archive_project_folder,
1383                        )
1384
1385                        # generate dataset folder
1386                        dataset_archive_folder = Folder(
1387                            "_".join([datasetId, datasetName, "archive"]),
1388                            parent=archive_project_folder.id,
1389                        )
1390                        dataset_archive_folder = self.syn.store(dataset_archive_folder)
1391                        self.synapse_entity_tracker.add(
1392                            synapse_id=dataset_archive_folder.id,
1393                            entity=dataset_archive_folder,
1394                        )
1395
1396                        for entityId in annotation_entities:
1397                            # move entities to folder
1398                            if not dry_run:
1399                                moved_entity = self.syn.move(
1400                                    entityId, dataset_archive_folder.id
1401                                )
1402                                self.synapse_entity_tracker.add(
1403                                    synapse_id=moved_entity.id, entity=moved_entity
1404                                )
1405                            else:
1406                                logging.info(
1407                                    f"{entityId} will be moved to folder {dataset_archive_folder.id}."
1408                                )
1409        else:
1410            raise LookupError(
1411                f"No datasets were found in the specified project: {projectId}. Re-check specified master_fileview in CONFIG and retry."
1412            )
1413        return manifests, manifest_loaded
1414
1415    @tracer.start_as_current_span("SynapseStorage::get_synapse_table")
1416    def get_synapse_table(self, synapse_id: str) -> Tuple[pd.DataFrame, CsvFileTable]:
1417        """Download synapse table as a pd dataframe; return table schema and etags as results too
1418
1419        Args:
1420            synapse_id: synapse ID of the table to query
1421        """
1422
1423        results = self.syn.tableQuery("SELECT * FROM {}".format(synapse_id))
1424        df = results.asDataFrame(
1425            rowIdAndVersionInIndex=False,
1426            na_values=STR_NA_VALUES_FILTERED,
1427            keep_default_na=False,
1428        )
1429
1430        return df, results
1431
1432    @missing_entity_handler
1433    @tracer.start_as_current_span("SynapseStorage::uploadDB")
1434    def uploadDB(
1435        self,
1436        dmge: DataModelGraphExplorer,
1437        manifest: pd.DataFrame,
1438        datasetId: str,
1439        table_name: str,
1440        restrict: bool = False,
1441        table_manipulation: str = "replace",
1442        table_column_names: str = "class_label",
1443    ):
1444        """
1445        Method to upload a database to an asset store. In synapse, this will upload a metadata table
1446
1447        Args:
1448            dmge: DataModelGraphExplorer object
1449            manifest: pd.Df manifest to upload
1450            datasetId: synID of the dataset for the manifest
1451            table_name: name of the table to be uploaded
1452            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1453            existingTableId: str of the synId of the existing table, if one already exists
1454            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1455            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1456                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1457                display label formatting.
1458        Returns:
1459            manifest_table_id: synID of the uploaded table
1460            manifest: the original manifset
1461            table_manifest: manifest formatted appropriately for the table
1462
1463        """
1464
1465        col_schema, table_manifest = self.formatDB(
1466            dmge=dmge, manifest=manifest, table_column_names=table_column_names
1467        )
1468
1469        manifest_table_id = self.buildDB(
1470            datasetId,
1471            table_name,
1472            col_schema,
1473            table_manifest,
1474            table_manipulation,
1475            dmge,
1476            restrict,
1477        )
1478
1479        return manifest_table_id, manifest, table_manifest
1480
1481    @tracer.start_as_current_span("SynapseStorage::formatDB")
1482    def formatDB(self, dmge, manifest, table_column_names):
1483        """
1484        Method to format a manifest appropriatly for upload as table
1485
1486        Args:
1487            dmge: DataModelGraphExplorer object
1488            manifest: pd.Df manifest to upload
1489            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
1490                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
1491                display label formatting.
1492        Returns:
1493            col_schema: schema for table columns: type, size, etc
1494            table_manifest: formatted manifest
1495
1496        """
1497        # Rename the manifest columns to display names to match fileview
1498
1499        blacklist_chars = ["(", ")", ".", " ", "-"]
1500        manifest_columns = manifest.columns.tolist()
1501
1502        table_manifest = deepcopy(manifest)
1503
1504        if table_column_names == "display_name":
1505            cols = table_manifest.columns
1506
1507        elif table_column_names == "display_label":
1508            cols = [
1509                str(col).translate({ord(x): "" for x in blacklist_chars})
1510                for col in manifest_columns
1511            ]
1512
1513        elif table_column_names == "class_label":
1514            cols = [
1515                get_class_label_from_display_name(str(col)).translate(
1516                    {ord(x): "" for x in blacklist_chars}
1517                )
1518                for col in manifest_columns
1519            ]
1520        else:
1521            ValueError(
1522                f"The provided table_column_name: {table_column_names} is not valid, please resubmit with an allowed value only."
1523            )
1524
1525        cols = list(map(lambda x: x.replace("EntityId", "entityId"), cols))
1526
1527        # Reset column names in table manifest
1528        table_manifest.columns = cols
1529
1530        # move entity id to end of df
1531        entity_col = table_manifest.pop("entityId")
1532        table_manifest.insert(len(table_manifest.columns), "entityId", entity_col)
1533
1534        # Get the column schema
1535        col_schema = as_table_columns(table_manifest)
1536
1537        # Set Id column length to 64 (for some reason not being auto set.)
1538        for i, col in enumerate(col_schema):
1539            if col["name"].lower() == "id":
1540                col_schema[i]["maximumSize"] = 64
1541
1542        return col_schema, table_manifest
1543
1544    @tracer.start_as_current_span("SynapseStorage::buildDB")
1545    def buildDB(
1546        self,
1547        datasetId: str,
1548        table_name: str,
1549        col_schema: List,
1550        table_manifest: pd.DataFrame,
1551        table_manipulation: str,
1552        dmge: DataModelGraphExplorer,
1553        restrict: bool = False,
1554    ):
1555        """
1556        Method to construct the table appropriately: create new table, replace existing, or upsert new into existing
1557        Calls TableOperations class to execute
1558
1559        Args:
1560            datasetId: synID of the dataset for the manifest
1561            table_name: name of the table to be uploaded
1562            col_schema: schema for table columns: type, size, etc from `formatDB`
1563            table_manifest: formatted manifest that can be uploaded as a table
1564            table_manipulation: str, 'replace' or 'upsert', in the case where a manifest already exists, should the new metadata replace the existing (replace) or be added to it (upsert)
1565            restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
1566
1567        Returns:
1568            manifest_table_id: synID of the uploaded table
1569
1570        """
1571        table_parent_id = self.getDatasetProject(datasetId=datasetId)
1572        existing_table_id = self.syn.findEntityId(
1573            name=table_name, parent=table_parent_id
1574        )
1575        tableOps = TableOperations(
1576            synStore=self,
1577            tableToLoad=table_manifest,
1578            tableName=table_name,
1579            datasetId=datasetId,
1580            existingTableId=existing_table_id,
1581            restrict=restrict,
1582            synapse_entity_tracker=self.synapse_entity_tracker,
1583        )
1584
1585        if not table_manipulation or existing_table_id is None:
1586            manifest_table_id = tableOps.createTable(
1587                columnTypeDict=col_schema,
1588                specifySchema=True,
1589            )
1590        elif existing_table_id is not None:
1591            if table_manipulation.lower() == "replace":
1592                manifest_table_id = tableOps.replaceTable(
1593                    specifySchema=True,
1594                    columnTypeDict=col_schema,
1595                )
1596            elif table_manipulation.lower() == "upsert":
1597                manifest_table_id = tableOps.upsertTable(
1598                    dmge=dmge,
1599                )
1600            elif table_manipulation.lower() == "update":
1601                manifest_table_id = tableOps.updateTable()
1602
1603        if table_manipulation and table_manipulation.lower() == "upsert":
1604            table_entity = self.synapse_entity_tracker.get(
1605                synapse_id=existing_table_id or manifest_table_id,
1606                syn=self.syn,
1607                download_file=False,
1608            )
1609            annos = OldAnnotations(
1610                id=table_entity.id,
1611                etag=table_entity.etag,
1612                values=table_entity.annotations,
1613            )
1614            annos["primary_key"] = table_manifest["Component"][0] + "_id"
1615            annos = self.syn.set_annotations(annos)
1616            table_entity.etag = annos.etag
1617            table_entity.annotations = annos
1618
1619        return manifest_table_id
1620
1621    @tracer.start_as_current_span("SynapseStorage::upload_manifest_file")
1622    def upload_manifest_file(
1623        self,
1624        manifest,
1625        metadataManifestPath,
1626        datasetId,
1627        restrict_manifest,
1628        component_name="",
1629    ):
1630        # Update manifest to have the new entityId column
1631        manifest.to_csv(metadataManifestPath, index=False)
1632
1633        # store manifest to Synapse as a CSV
1634        # update file name
1635        file_name_full = metadataManifestPath.split("/")[-1]
1636        file_extension = file_name_full.split(".")[-1]
1637
1638        # Differentiate "censored" and "uncensored" manifest
1639        if "censored" in file_name_full:
1640            file_name_new = (
1641                os.path.basename(CONFIG.synapse_manifest_basename)
1642                + "_"
1643                + component_name
1644                + "_censored"
1645                + "."
1646                + file_extension
1647            )
1648        else:
1649            file_name_new = (
1650                os.path.basename(CONFIG.synapse_manifest_basename)
1651                + "_"
1652                + component_name
1653                + "."
1654                + file_extension
1655            )
1656
1657        manifest_synapse_file = None
1658        try:
1659            # Rename the file to file_name_new then revert
1660            # This is to maintain the original file name in-case other code is
1661            # expecting that the file exists with the original name
1662            original_file_path = metadataManifestPath
1663            new_file_path = os.path.join(
1664                os.path.dirname(metadataManifestPath), file_name_new
1665            )
1666            os.rename(original_file_path, new_file_path)
1667
1668            manifest_synapse_file = self._store_file_for_manifest_upload(
1669                new_file_path=new_file_path,
1670                dataset_id=datasetId,
1671                existing_file_name=file_name_full,
1672                file_name_new=file_name_new,
1673                restrict_manifest=restrict_manifest,
1674            )
1675            manifest_synapse_file_id = manifest_synapse_file.id
1676
1677        finally:
1678            # Revert the file name back to the original
1679            os.rename(new_file_path, original_file_path)
1680
1681            if manifest_synapse_file:
1682                manifest_synapse_file.path = original_file_path
1683
1684        return manifest_synapse_file_id
1685
1686    def _store_file_for_manifest_upload(
1687        self,
1688        new_file_path: str,
1689        dataset_id: str,
1690        existing_file_name: str,
1691        file_name_new: str,
1692        restrict_manifest: bool,
1693    ) -> File:
1694        """Handles a create or update of a manifest file that is going to be uploaded.
1695        If we already have a copy of the Entity in memory we will update that instance,
1696        otherwise create a new File instance to be created in Synapse. Once stored
1697        this will add the file to the `synapse_entity_tracker` for future reference.
1698
1699        Args:
1700            new_file_path (str): The path to the new manifest file
1701            dataset_id (str): The Synapse ID of the dataset the manifest is associated with
1702            existing_file_name (str): The name of the existing file
1703            file_name_new (str): The name of the new file
1704            restrict_manifest (bool): Whether the manifest should be restricted
1705
1706        Returns:
1707            File: The stored manifest file
1708        """
1709        local_tracked_file_instance = (
1710            self.synapse_entity_tracker.search_local_by_parent_and_name(
1711                name=existing_file_name, parent_id=dataset_id
1712            )
1713            or self.synapse_entity_tracker.search_local_by_parent_and_name(
1714                name=file_name_new, parent_id=dataset_id
1715            )
1716        )
1717
1718        if local_tracked_file_instance:
1719            local_tracked_file_instance.path = new_file_path
1720            local_tracked_file_instance.description = (
1721                "Manifest for dataset " + dataset_id
1722            )
1723            manifest_synapse_file = local_tracked_file_instance
1724        else:
1725            manifest_synapse_file = File(
1726                path=new_file_path,
1727                description="Manifest for dataset " + dataset_id,
1728                parent=dataset_id,
1729                name=file_name_new,
1730            )
1731
1732        manifest_synapse_file = self.syn.store(
1733            manifest_synapse_file, isRestricted=restrict_manifest
1734        )
1735
1736        self.synapse_entity_tracker.add(
1737            synapse_id=manifest_synapse_file.id, entity=manifest_synapse_file
1738        )
1739        return manifest_synapse_file
1740
1741    async def get_async_annotation(self, synapse_id: str) -> Dict[str, Any]:
1742        """get annotations asynchronously
1743
1744        Args:
1745            synapse_id (str): synapse id of the entity that the annotation belongs
1746
1747        Returns:
1748            Dict[str, Any]: The requested entity bundle matching
1749            <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/entitybundle/v2/EntityBundle.html>
1750        """
1751        return await get_entity_id_bundle2(
1752            entity_id=synapse_id,
1753            request={"includeAnnotations": True},
1754            synapse_client=self.syn,
1755        )
1756
1757    async def store_async_annotation(self, annotation_dict: dict) -> Annotations:
1758        """store annotation in an async way
1759
1760        Args:
1761            annotation_dict (dict): annotation in a dictionary format
1762
1763        Returns:
1764            Annotations: The stored annotations.
1765        """
1766        annotation_data = Annotations.from_dict(
1767            synapse_annotations=annotation_dict["annotations"]["annotations"]
1768        )
1769        annotation_class = Annotations(
1770            annotations=annotation_data,
1771            etag=annotation_dict["annotations"]["etag"],
1772            id=annotation_dict["annotations"]["id"],
1773        )
1774        annotation_storage_result = await annotation_class.store_async(
1775            synapse_client=self.syn
1776        )
1777        local_entity = self.synapse_entity_tracker.get(
1778            synapse_id=annotation_dict["annotations"]["id"],
1779            syn=self.syn,
1780            download_file=False,
1781            retrieve_if_not_present=False,
1782        )
1783        if local_entity:
1784            local_entity.etag = annotation_storage_result.etag
1785            local_entity.annotations = annotation_storage_result
1786        return annotation_storage_result
1787
1788    def process_row_annotations(
1789        self,
1790        dmge: DataModelGraphExplorer,
1791        metadata_syn: Dict[str, Any],
1792        hide_blanks: bool,
1793        csv_list_regex: str,
1794        annos: Dict[str, Any],
1795        annotation_keys: str,
1796    ) -> Dict[str, Any]:
1797        """Processes metadata annotations based on the logic below:
1798        1. Checks if the hide_blanks flag is True, and if the current annotation value (anno_v) is:
1799            An empty or whitespace-only string.
1800            A NaN value (if the annotation is a float).
1801        if any of the above conditions are met, and hide_blanks is True, the annotation key is not going to be uploaded and skips further processing of that annotation key.
1802        if any of the above conditions are met, and hide_blanks is False, assigns an empty string "" as the annotation value for that key.
1803
1804        2. If the value is a string and matches the pattern defined by csv_list_regex, get validation rule based on "node label" or "node display name".
1805        Check if the rule contains "list" as a rule, if it does, split the string by comma and assign the resulting list as the annotation value for that key.
1806
1807        3. For any other conditions, assigns the original value of anno_v to the annotation key (anno_k).
1808
1809        4. Returns the updated annotations dictionary.
1810
1811        Args:
1812            dmge (DataModelGraphExplorer): data model graph explorer
1813            metadata_syn (dict): metadata used for Synapse storage
1814            hideBlanks (bool): if true, does not upload annotation keys with blank values.
1815            csv_list_regex (str): Regex to match with comma separated list
1816            annos (Dict[str, Any]): dictionary of annotation returned from synapse
1817            annotation_keys (str): display_label/class_label
1818
1819        Returns:
1820            Dict[str, Any]: annotations as a dictionary
1821
1822        ```mermaid
1823        flowchart TD
1824            A[Start] --> C{Is anno_v empty, whitespace, or NaN?}
1825            C -- Yes --> D{Is hide_blanks True?}
1826            D -- Yes --> E[Remove this annotation key from the annotation dictionary to be uploaded. Skip further processing]
1827            D -- No --> F[Assign empty string to annotation key]
1828            C -- No --> G{Is anno_v a string?}
1829            G -- No --> H[Assign original value of anno_v to annotation key]
1830            G -- Yes --> I{Does anno_v match csv_list_regex?}
1831            I -- Yes --> J[Get validation rule of anno_k]
1832            J --> K{Does the validation rule contain 'list'}
1833            K -- Yes --> L[Split anno_v by commas and assign as list]
1834            I -- No --> H
1835            K -- No --> H
1836        ```
1837        """
1838        for anno_k, anno_v in metadata_syn.items():
1839            # Remove keys with nan or empty string values or string that only contains white space from dict of annotations to be uploaded
1840            # if present on current data annotation
1841            if hide_blanks and (
1842                (isinstance(anno_v, str) and anno_v.strip() == "")
1843                or (isinstance(anno_v, float) and np.isnan(anno_v))
1844            ):
1845                annos["annotations"]["annotations"].pop(anno_k) if anno_k in annos[
1846                    "annotations"
1847                ]["annotations"].keys() else annos["annotations"]["annotations"]
1848                continue
1849
1850            # Otherwise save annotation as approrpriate
1851            if isinstance(anno_v, float) and np.isnan(anno_v):
1852                annos["annotations"]["annotations"][anno_k] = ""
1853                continue
1854
1855            # Handle strings that match the csv_list_regex and pass the validation rule
1856            if isinstance(anno_v, str) and re.fullmatch(csv_list_regex, anno_v):
1857                # Use a dictionary to dynamically choose the argument
1858                param = (
1859                    {"node_display_name": anno_k}
1860                    if annotation_keys == "display_label"
1861                    else {"node_label": anno_k}
1862                )
1863                node_validation_rules = dmge.get_node_validation_rules(**param)
1864
1865                if rule_in_rule_list("list", node_validation_rules):
1866                    annos["annotations"]["annotations"][anno_k] = anno_v.split(",")
1867                    continue
1868            # default: assign the original value
1869            annos["annotations"]["annotations"][anno_k] = anno_v
1870
1871        return annos
1872
1873    @async_missing_entity_handler
1874    async def format_row_annotations(
1875        self,
1876        dmge: DataModelGraphExplorer,
1877        row: pd.Series,
1878        entityId: str,
1879        hideBlanks: bool,
1880        annotation_keys: str,
1881    ) -> Union[None, Dict[str, Any]]:
1882        """Format row annotations
1883
1884        Args:
1885            dmge (DataModelGraphExplorer): data moodel graph explorer object
1886            row (pd.Series): row of the manifest
1887            entityId (str): entity id of the manifest
1888            hideBlanks (bool): when true, does not upload annotation keys with blank values. When false, upload Annotation keys with empty string values
1889            annotation_keys (str): display_label/class_label
1890
1891        Returns:
1892            Union[None, Dict[str,]]: if entity id is in trash can, return None. Otherwise, return the annotations
1893        """
1894        # prepare metadata for Synapse storage (resolve display name into a name that Synapse annotations support (e.g no spaces, parenthesis)
1895        # note: the removal of special characters, will apply only to annotation keys; we are not altering the manifest
1896        # this could create a divergence between manifest column and annotations. this should be ok for most use cases.
1897        # columns with special characters are outside of the schema
1898        metadataSyn = {}
1899        blacklist_chars = ["(", ")", ".", " ", "-"]
1900
1901        for k, v in row.to_dict().items():
1902            if annotation_keys == "display_label":
1903                keySyn = str(k).translate({ord(x): "" for x in blacklist_chars})
1904            elif annotation_keys == "class_label":
1905                keySyn = get_class_label_from_display_name(str(k)).translate(
1906                    {ord(x): "" for x in blacklist_chars}
1907                )
1908
1909            # Skip `Filename` and `ETag` columns when setting annotations
1910            if keySyn in ["Filename", "ETag", "eTag"]:
1911                continue
1912
1913            # truncate annotation values to 500 characters if the
1914            # size of values is greater than equal to 500 characters
1915            # add an explicit [truncatedByDataCuratorApp] message at the end
1916            # of every truncated message to indicate that the cell value
1917            # has been truncated
1918            if isinstance(v, str) and len(v) >= 500:
1919                v = v[0:472] + "[truncatedByDataCuratorApp]"
1920
1921            metadataSyn[keySyn] = v
1922
1923        # This will first check if the entity is already in memory, and if so, that
1924        # instance is used. Unfortunately, the expected return format needs to match
1925        # the Synapse API, so we need to convert the annotations to the expected format.
1926        entity = self.synapse_entity_tracker.get(
1927            synapse_id=entityId,
1928            syn=self.syn,
1929            download_file=False,
1930            retrieve_if_not_present=False,
1931        )
1932        if entity is not None:
1933            synapse_annotations = _convert_to_annotations_list(
1934                annotations=entity.annotations
1935            )
1936            annos = {
1937                "annotations": {
1938                    "id": entity.id,
1939                    "etag": entity.etag,
1940                    "annotations": synapse_annotations,
1941                }
1942            }
1943        else:
1944            annos = await self.get_async_annotation(entityId)
1945
1946        # set annotation(s) for the various objects/items in a dataset on Synapse
1947        csv_list_regex = comma_separated_list_regex()
1948
1949        annos = self.process_row_annotations(
1950            dmge=dmge,
1951            metadata_syn=metadataSyn,
1952            hide_blanks=hideBlanks,
1953            csv_list_regex=csv_list_regex,
1954            annos=annos,
1955            annotation_keys=annotation_keys,
1956        )
1957
1958        return annos
1959
1960    @missing_entity_handler
1961    @tracer.start_as_current_span("SynapseStorage::format_manifest_annotations")
1962    def format_manifest_annotations(self, manifest, manifest_synapse_id):
1963        """
1964        Set annotations for the manifest (as a whole) so they can be applied to the manifest table or csv.
1965        For now just getting the Component.
1966        """
1967
1968        entity = self.synapse_entity_tracker.get(
1969            synapse_id=manifest_synapse_id, syn=self.syn, download_file=False
1970        )
1971        is_file = entity.concreteType.endswith(".FileEntity")
1972        is_table = entity.concreteType.endswith(".TableEntity")
1973
1974        if is_file:
1975            # Get file metadata
1976            metadata = self.getFileAnnotations(manifest_synapse_id)
1977
1978            # If there is a defined component add it to the metadata.
1979            if "Component" in manifest.columns:
1980                # Gather component information
1981                component = manifest["Component"].unique()
1982
1983                # Double check that only a single component is listed, else raise an error.
1984                try:
1985                    len(component) == 1
1986                except ValueError as err:
1987                    raise ValueError(
1988                        f"Manifest has more than one component. Please check manifest and resubmit."
1989                    ) from err
1990
1991                # Add component to metadata
1992                metadata["Component"] = component[0]
1993
1994        elif is_table:
1995            # Get table metadata
1996            metadata = self.getTableAnnotations(manifest_synapse_id)
1997
1998        # Get annotations
1999        annos = OldAnnotations(
2000            id=entity.id, etag=entity.etag, values=entity.annotations
2001        )
2002
2003        # Add metadata to the annotations
2004        for annos_k, annos_v in metadata.items():
2005            annos[annos_k] = annos_v
2006        return annos
2007
2008    '''
2009    def annotate_upload_manifest_table(self, manifest, datasetId, metadataManifestPath,
2010        useSchemaLabel: bool = True, hideBlanks: bool = False, restrict_manifest = False):
2011        """
2012        Purpose:
2013            Works very similarly to associateMetadataWithFiles except takes in the manifest
2014            rather than the manifest path
2015
2016        """
2017
2018        # Add uuid for table updates and fill.
2019        if not "Uuid" in manifest.columns:
2020            manifest["Uuid"] = ''
2021
2022        for idx,row in manifest.iterrows():
2023            if not row["Uuid"]:
2024                gen_uuid = uuid.uuid4()
2025                row["Uuid"] = gen_uuid
2026                manifest.loc[idx, 'Uuid'] = gen_uuid
2027
2028        # add entityId as a column if not already there or
2029        # fill any blanks with an empty string.
2030        if not "entityId" in manifest.columns:
2031            manifest["entityId"] = ""
2032        else:
2033            manifest["entityId"].fillna("", inplace=True)
2034
2035        # get a DataModelGraphExplorer object to ensure schema attribute names used in manifest are translated to schema labels for synapse annotations
2036        dmge = DataModelGraphExplorer()
2037
2038        # Create table name here.
2039        if 'Component' in manifest.columns:
2040            table_name = manifest['Component'][0].lower() + '_synapse_storage_manifest_table'
2041        else:
2042            table_name = 'synapse_storage_manifest_table'
2043
2044        # Upload manifest as a table and get the SynID and manifest
2045        manifest_synapse_table_id, manifest, table_manifest = self.upload_format_manifest_table(
2046                                                    dmge, manifest, datasetId, table_name, restrict = restrict_manifest, useSchemaLabel=useSchemaLabel,)
2047
2048        # Iterate over manifest rows, create Synapse entities and store corresponding entity IDs in manifest if needed
2049        # also set metadata for each synapse entity as Synapse annotations
2050        for idx, row in manifest.iterrows():
2051            if not row["entityId"]:
2052                # If not using entityIds, fill with manifest_table_id so
2053                row["entityId"] = manifest_synapse_table_id
2054                entityId = ''
2055            else:
2056                # get the entity id corresponding to this row
2057                entityId = row["entityId"]
2058
2059        # Load manifest to synapse as a CSV File
2060        manifest_synapse_file_id = self.upload_manifest_file(manifest, metadataManifestPath, datasetId, restrict_manifest)
2061
2062        # Get annotations for the file manifest.
2063        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_file_id)
2064
2065        self.syn.set_annotations(manifest_annotations)
2066
2067        logger.info("Associated manifest file with dataset on Synapse.")
2068
2069        # Update manifest Synapse table with new entity id column.
2070        self.make_synapse_table(
2071            table_to_load = table_manifest,
2072            dataset_id = datasetId,
2073            existingTableId = manifest_synapse_table_id,
2074            table_name = table_name,
2075            update_col = 'Uuid',
2076            specify_schema = False,
2077            )
2078
2079        # Get annotations for the table manifest
2080        manifest_annotations = self.format_manifest_annotations(manifest, manifest_synapse_table_id)
2081        self.syn.set_annotations(manifest_annotations)
2082        return manifest_synapse_table_id
2083    '''
2084
2085    def _read_manifest(self, metadataManifestPath: str) -> pd.DataFrame:
2086        """Helper function to read in provided manifest as a pandas DataFrame for subsequent downstream processing.
2087        Args:
2088            metadataManifestPath (str): path where manifest is stored
2089        Returns:
2090            manifest(pd.DataFrame): Manifest loaded as a pandas dataframe
2091        Raises:
2092            FileNotFoundError: Manifest file does not exist at provided path.
2093        """
2094        # read new manifest csv
2095        try:
2096            load_args = {
2097                "dtype": "string",
2098            }
2099            manifest = load_df(
2100                metadataManifestPath,
2101                preserve_raw_input=False,
2102                allow_na_values=False,
2103                **load_args,
2104            )
2105        except FileNotFoundError as err:
2106            raise FileNotFoundError(
2107                f"No manifest file was found at this path: {metadataManifestPath}"
2108            ) from err
2109        return manifest
2110
2111    def _add_id_columns_to_manifest(
2112        self, manifest: pd.DataFrame, dmge: DataModelGraphExplorer
2113    ) -> pd.DataFrame:
2114        """
2115        Ensures that the manifest DataFrame has standardized 'Id' and 'entityId' columns.
2116
2117        - If any case variation of the 'id' column is present (e.g., 'id', 'ID', 'iD'), it is renamed to 'Id'.
2118        - If any case variation of the 'entityid' column is present, it is renamed to 'entityId'.
2119        - If any case variation of the 'uuid' column is present, it is renamed to 'uuid' before further processing.
2120        - If 'Id' is still missing:
2121            - It will be created as an empty column, or
2122            - Derived from a 'Uuid' column, depending on whether 'uuid' is defined in the schema.
2123        - If both 'uuid' and 'Id' columns exist, the 'uuid' column is dropped.
2124        - Missing values in the 'Id' column are filled with generated UUIDs.
2125        - If 'entityId' is still missing, it will be created and filled with empty strings.
2126        - If 'entityId' is already present, any missing values will be replaced with empty strings.
2127
2128        Args:
2129            manifest (pd.DataFrame): The metadata manifest to be updated.
2130            dmge (DataModelGraphExplorer): Data model graph explorer object.
2131
2132        Returns:
2133            pd.DataFrame: The updated manifest with a standardized 'Id' column and an 'entityId' column.
2134        """
2135
2136        # Normalize any variation of 'id' to 'Id', "entityid" to "entityId", "Uuid" to "uuid"
2137        for col in manifest.columns:
2138            if col.lower() == "id":
2139                manifest = manifest.rename(columns={col: ID_COLUMN})
2140            if col.lower() == "entityid":
2141                manifest = manifest.rename(columns={col: ENTITY_ID_COLUMN})
2142            if col.lower() == "uuid":
2143                manifest = manifest.rename(columns={col: UUID_COLUMN})
2144
2145        # If 'Id' still doesn't exist, see if uuid column exists
2146        # Rename uuid column to "Id" column
2147        if ID_COLUMN not in manifest.columns:
2148            # See if schema has `Uuid` column specified
2149            try:
2150                uuid_col_in_schema = dmge.is_class_in_schema(
2151                    "Uuid"
2152                ) or dmge.is_class_in_schema("uuid")
2153            except KeyError:
2154                uuid_col_in_schema = False
2155
2156            # Rename `uuid` column if it wasn't specified in the schema
2157            if UUID_COLUMN in manifest.columns and not uuid_col_in_schema:
2158                manifest = manifest.rename(columns={UUID_COLUMN: ID_COLUMN})
2159            # If no `uuid` column exists or it is specified in the schema, create a new `Id` column
2160            else:
2161                manifest[ID_COLUMN] = ""
2162        else:
2163            # 'Id' already exists, ignore 'uuid'
2164            if UUID_COLUMN in manifest.columns:
2165                manifest = manifest.drop(columns=[UUID_COLUMN])
2166
2167        # Fill in UUIDs in the "Id" column if missing
2168        for idx, row in manifest.iterrows():
2169            if not row["Id"]:
2170                gen_uuid = str(uuid.uuid4())
2171                row["Id"] = gen_uuid
2172                manifest.loc[idx, ID_COLUMN] = gen_uuid
2173
2174        # Add entityId as a column if not already there
2175        if ENTITY_ID_COLUMN not in manifest:
2176            manifest[ENTITY_ID_COLUMN] = ""
2177        else:
2178            manifest[ENTITY_ID_COLUMN] = manifest[ENTITY_ID_COLUMN].fillna("")
2179
2180        return manifest
2181
2182    def _generate_table_name(self, manifest):
2183        """Helper function to generate a table name for upload to synapse.
2184
2185        Args:
2186            Manifest loaded as a pd.Dataframe
2187
2188        Returns:
2189            table_name (str): Name of the table to load
2190            component_name (str): Name of the manifest component (if applicable)
2191        """
2192        # Create table name here.
2193        if "Component" in manifest.columns:
2194            component_name = manifest["Component"][0].lower()
2195            table_name = component_name + "_synapse_storage_manifest_table"
2196        else:
2197            component_name = ""
2198            table_name = "synapse_storage_manifest_table"
2199        return table_name, component_name
2200
2201    def _create_entity_id(self, idx, row, manifest, datasetId):
2202        """Helper function to generate an entityId and add it to the appropriate row in the manifest.
2203        Args:
2204            row: current row of manifest being processed
2205            manifest (pd.DataFrame): loaded df containing user supplied data.
2206            datasetId (str): synapse ID of folder containing the dataset
2207
2208        Returns:
2209            manifest (pd.DataFrame): manifest with entityId added to the appropriate row
2210            entityId (str): Generated Entity Id.
2211
2212        """
2213        rowEntity = Folder(str(uuid.uuid4()), parent=datasetId)
2214        rowEntity = self.syn.store(rowEntity)
2215        entityId = rowEntity["id"]
2216        self.synapse_entity_tracker.add(synapse_id=entityId, entity=rowEntity)
2217        row["entityId"] = entityId
2218        manifest.loc[idx, "entityId"] = entityId
2219        return manifest, entityId
2220
2221    async def _process_store_annos(self, requests: Set[asyncio.Task]) -> None:
2222        """Process annotations and store them on synapse asynchronously
2223
2224        Args:
2225            requests (Set[asyncio.Task]): a set of tasks of formatting annotations created by format_row_annotations function in previous step
2226
2227        Raises:
2228            RuntimeError: raise a run time error if a task failed to complete
2229        """
2230        while requests:
2231            done_tasks, pending_tasks = await asyncio.wait(
2232                requests, return_when=asyncio.FIRST_COMPLETED
2233            )
2234            requests = pending_tasks
2235
2236            for completed_task in done_tasks:
2237                try:
2238                    annos = completed_task.result()
2239
2240                    if isinstance(annos, Annotations):
2241                        logger.info(f"Successfully stored annotations for {annos.id}")
2242                    else:
2243                        # store annotations if they are not None
2244                        if annos:
2245                            entity_id = annos["annotations"]["id"]
2246                            logger.info(
2247                                f"Obtained and processed annotations for {entity_id} entity"
2248                            )
2249                            requests.add(
2250                                asyncio.create_task(
2251                                    self.store_async_annotation(annotation_dict=annos)
2252                                )
2253                            )
2254                except Exception as e:
2255                    raise RuntimeError(f"failed with { repr(e) }.") from e
2256
2257    @tracer.start_as_current_span("SynapseStorage::add_annotations_to_entities_files")
2258    async def add_annotations_to_entities_files(
2259        self,
2260        dmge,
2261        manifest,
2262        manifest_record_type: str,
2263        datasetId: str,
2264        hideBlanks: bool,
2265        manifest_synapse_table_id="",
2266        annotation_keys: str = "class_label",
2267    ):
2268        """
2269        Depending on upload type add Ids to entityId row. Add anotations to connected
2270        files and folders. Despite the name of this function, it also applies to folders.
2271
2272        Args:
2273            dmge: DataModelGraphExplorer Object
2274            manifest (pd.DataFrame): loaded df containing user supplied data.
2275            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2276            datasetId (str): synapse ID of folder containing the dataset
2277            hideBlanks (bool): Default is false -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2278            manifest_synapse_table_id (str): Default is an empty string ''.
2279            annotation_keys: (str) display_label/class_label(default), Determines labeling syle for annotation keys. class_label will format the display
2280                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2281                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2282        Returns:
2283            manifest (pd.DataFrame): modified to add entitiyId as appropriate
2284
2285        """
2286
2287        # Expected behavior is to annotate files if `Filename` is present and if file_annotations_upload is set to True regardless of `-mrt` setting
2288        if "filename" in [col.lower() for col in manifest.columns]:
2289            # get current list of files and store as dataframe
2290            dataset_files = self.getFilesInStorageDataset(datasetId)
2291            files_and_entityIds = self._get_file_entityIds(
2292                dataset_files=dataset_files, only_new_files=False
2293            )
2294            file_df = pd.DataFrame(files_and_entityIds)
2295
2296            # Merge dataframes to add entityIds
2297            manifest = manifest.merge(
2298                file_df, how="left", on="Filename", suffixes=["_x", None]
2299            ).drop("entityId_x", axis=1)
2300
2301        # Fill `entityId` for each row if missing and annotate entity as appropriate
2302        requests = set()
2303        for idx, row in manifest.iterrows():
2304            if not row["entityId"] and (
2305                manifest_record_type == "file_and_entities"
2306                or manifest_record_type == "table_file_and_entities"
2307            ):
2308                manifest, entityId = self._create_entity_id(
2309                    idx, row, manifest, datasetId
2310                )
2311            elif not row["entityId"] and manifest_record_type == "table_and_file":
2312                # If not using entityIds, fill with manifest_table_id so
2313                row["entityId"] = manifest_synapse_table_id
2314                manifest.loc[idx, "entityId"] = manifest_synapse_table_id
2315                entityId = ""
2316                # If the row is the manifest table, do not add annotations
2317            elif row["entityId"] == manifest_synapse_table_id:
2318                entityId = ""
2319            else:
2320                # get the file id of the file to annotate, collected in above step.
2321                entityId = row["entityId"]
2322
2323            # Adding annotations to connected files.
2324            if entityId:
2325                # Format annotations for Synapse
2326                annos_task = asyncio.create_task(
2327                    self.format_row_annotations(
2328                        dmge, row, entityId, hideBlanks, annotation_keys
2329                    )
2330                )
2331                requests.add(annos_task)
2332        await self._process_store_annos(requests)
2333        return manifest
2334
2335    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_table")
2336    def upload_manifest_as_table(
2337        self,
2338        dmge: DataModelGraphExplorer,
2339        manifest: pd.DataFrame,
2340        metadataManifestPath: str,
2341        datasetId: str,
2342        table_name: str,
2343        component_name: str,
2344        restrict: bool,
2345        manifest_record_type: str,
2346        hideBlanks: bool,
2347        table_manipulation: str,
2348        table_column_names: str,
2349        annotation_keys: str,
2350        file_annotations_upload: bool = True,
2351    ):
2352        """Upload manifest to Synapse as a table and csv.
2353        Args:
2354            dmge: DataModelGraphExplorer object
2355            manifest (pd.DataFrame): loaded df containing user supplied data.
2356            metadataManifestPath: path to csv containing a validated metadata manifest.
2357            datasetId (str): synapse ID of folder containing the dataset
2358            table_name (str): Generated to name the table being uploaded.
2359            component_name (str): Name of the component manifest that is currently being uploaded.
2360            restrict (bool): Flag for censored data.
2361            manifest_record_type (str): valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2362            hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2363            table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2364            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2365                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2366                display label formatting.
2367            annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display
2368                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2369                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2370            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2371        Return:
2372            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2373        """
2374        # Upload manifest as a table, get the ID and updated manifest.
2375        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2376            dmge=dmge,
2377            manifest=manifest,
2378            datasetId=datasetId,
2379            table_name=table_name,
2380            restrict=restrict,
2381            table_manipulation=table_manipulation,
2382            table_column_names=table_column_names,
2383        )
2384
2385        if file_annotations_upload:
2386            manifest = asyncio.run(
2387                self.add_annotations_to_entities_files(
2388                    dmge,
2389                    manifest,
2390                    manifest_record_type,
2391                    datasetId,
2392                    hideBlanks,
2393                    manifest_synapse_table_id,
2394                    annotation_keys,
2395                )
2396            )
2397        # Load manifest to synapse as a CSV File
2398        manifest_synapse_file_id = self.upload_manifest_file(
2399            manifest=manifest,
2400            metadataManifestPath=metadataManifestPath,
2401            datasetId=datasetId,
2402            restrict_manifest=restrict,
2403            component_name=component_name,
2404        )
2405
2406        # Set annotations for the file manifest.
2407        manifest_annotations = self.format_manifest_annotations(
2408            manifest=manifest, manifest_synapse_id=manifest_synapse_file_id
2409        )
2410        annos = self.syn.set_annotations(annotations=manifest_annotations)
2411        manifest_entity = self.synapse_entity_tracker.get(
2412            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2413        )
2414        manifest_entity.annotations = annos
2415        manifest_entity.etag = annos.etag
2416
2417        logger.info("Associated manifest file with dataset on Synapse.")
2418
2419        # Update manifest Synapse table with new entity id column.
2420        manifest_synapse_table_id, manifest, _ = self.uploadDB(
2421            dmge=dmge,
2422            manifest=manifest,
2423            datasetId=datasetId,
2424            table_name=table_name,
2425            restrict=restrict,
2426            table_manipulation="update",
2427            table_column_names=table_column_names,
2428        )
2429
2430        # Set annotations for the table manifest
2431        manifest_annotations = self.format_manifest_annotations(
2432            manifest=manifest, manifest_synapse_id=manifest_synapse_table_id
2433        )
2434        annotations_manifest_table = self.syn.set_annotations(
2435            annotations=manifest_annotations
2436        )
2437        manifest_table_entity = self.synapse_entity_tracker.get(
2438            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2439        )
2440        manifest_table_entity.annotations = annotations_manifest_table
2441        manifest_table_entity.etag = annotations_manifest_table.etag
2442
2443        return manifest_synapse_file_id
2444
2445    @tracer.start_as_current_span("SynapseStorage::upload_manifest_as_csv")
2446    def upload_manifest_as_csv(
2447        self,
2448        dmge,
2449        manifest,
2450        metadataManifestPath,
2451        datasetId,
2452        restrict,
2453        manifest_record_type,
2454        hideBlanks,
2455        component_name,
2456        annotation_keys: str,
2457        file_annotations_upload: bool = True,
2458    ):
2459        """Upload manifest to Synapse as a csv only.
2460        Args:
2461            dmge: DataModelGraphExplorer object
2462            manifest (pd.DataFrame): loaded df containing user supplied data.
2463            metadataManifestPath: path to csv containing a validated metadata manifest.
2464            datasetId (str): synapse ID of folder containing the dataset
2465            restrict (bool): Flag for censored data.
2466            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2467            hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2468            annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display
2469                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2470                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2471            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2472        Return:
2473            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2474        """
2475        if file_annotations_upload:
2476            manifest = asyncio.run(
2477                self.add_annotations_to_entities_files(
2478                    dmge,
2479                    manifest,
2480                    manifest_record_type,
2481                    datasetId,
2482                    hideBlanks,
2483                    annotation_keys=annotation_keys,
2484                )
2485            )
2486
2487        # Load manifest to synapse as a CSV File
2488        manifest_synapse_file_id = self.upload_manifest_file(
2489            manifest,
2490            metadataManifestPath,
2491            datasetId,
2492            restrict,
2493            component_name=component_name,
2494        )
2495
2496        # Set annotations for the file manifest.
2497        manifest_annotations = self.format_manifest_annotations(
2498            manifest, manifest_synapse_file_id
2499        )
2500        annos = self.syn.set_annotations(manifest_annotations)
2501        manifest_entity = self.synapse_entity_tracker.get(
2502            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2503        )
2504        manifest_entity.annotations = annos
2505        manifest_entity.etag = annos.etag
2506
2507        logger.info("Associated manifest file with dataset on Synapse.")
2508
2509        return manifest_synapse_file_id
2510
2511    @tracer.start_as_current_span("SynapseStorage::upload_manifest_combo")
2512    def upload_manifest_combo(
2513        self,
2514        dmge,
2515        manifest,
2516        metadataManifestPath,
2517        datasetId,
2518        table_name,
2519        component_name,
2520        restrict,
2521        manifest_record_type,
2522        hideBlanks,
2523        table_manipulation,
2524        table_column_names: str,
2525        annotation_keys: str,
2526        file_annotations_upload: bool = True,
2527    ):
2528        """Upload manifest to Synapse as a table and CSV with entities.
2529        Args:
2530            dmge: DataModelGraphExplorer object
2531            manifest (pd.DataFrame): loaded df containing user supplied data.
2532            metadataManifestPath: path to csv containing a validated metadata manifest.
2533            datasetId (str): synapse ID of folder containing the dataset
2534            table_name (str): Generated to name the table being uploaded.
2535            component_name (str): Name of the component manifest that is currently being uploaded.
2536            restrict (bool): Flag for censored data.
2537            manifest_record_type: valid values are 'entity', 'table' or 'both'. Specifies whether to create entity ids and folders for each row in a manifest, a Synapse table to house the entire manifest or do both.
2538            hideBlanks (bool): Default is False -Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2539            table_malnipulation (str): Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2540            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2541                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2542                display label formatting.
2543            annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display
2544                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2545                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2546            file_annotations_upload (bool): Default to True. If false, do not add annotations to files.
2547        Return:
2548            manifest_synapse_file_id (str): SynID of manifest csv uploaded to synapse.
2549        """
2550        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2551            dmge=dmge,
2552            manifest=manifest,
2553            datasetId=datasetId,
2554            table_name=table_name,
2555            restrict=restrict,
2556            table_manipulation=table_manipulation,
2557            table_column_names=table_column_names,
2558        )
2559
2560        if file_annotations_upload:
2561            manifest = asyncio.run(
2562                self.add_annotations_to_entities_files(
2563                    dmge,
2564                    manifest,
2565                    manifest_record_type,
2566                    datasetId,
2567                    hideBlanks,
2568                    manifest_synapse_table_id,
2569                    annotation_keys=annotation_keys,
2570                )
2571            )
2572
2573        # Load manifest to synapse as a CSV File
2574        manifest_synapse_file_id = self.upload_manifest_file(
2575            manifest, metadataManifestPath, datasetId, restrict, component_name
2576        )
2577
2578        # Set annotations for the file manifest.
2579        manifest_annotations = self.format_manifest_annotations(
2580            manifest, manifest_synapse_file_id
2581        )
2582        file_manifest_annoations = self.syn.set_annotations(manifest_annotations)
2583        manifest_entity = self.synapse_entity_tracker.get(
2584            synapse_id=manifest_synapse_file_id, syn=self.syn, download_file=False
2585        )
2586        manifest_entity.annotations = file_manifest_annoations
2587        manifest_entity.etag = file_manifest_annoations.etag
2588        logger.info("Associated manifest file with dataset on Synapse.")
2589
2590        # Update manifest Synapse table with new entity id column.
2591        manifest_synapse_table_id, manifest, table_manifest = self.uploadDB(
2592            dmge=dmge,
2593            manifest=manifest,
2594            datasetId=datasetId,
2595            table_name=table_name,
2596            restrict=restrict,
2597            table_manipulation="update",
2598            table_column_names=table_column_names,
2599        )
2600
2601        # Set annotations for the table manifest
2602        manifest_annotations = self.format_manifest_annotations(
2603            manifest, manifest_synapse_table_id
2604        )
2605        table_manifest_annotations = self.syn.set_annotations(manifest_annotations)
2606        manifest_entity = self.synapse_entity_tracker.get(
2607            synapse_id=manifest_synapse_table_id, syn=self.syn, download_file=False
2608        )
2609        manifest_entity.annotations = table_manifest_annotations
2610        manifest_entity.etag = table_manifest_annotations.etag
2611        return manifest_synapse_file_id
2612
2613    @tracer.start_as_current_span("SynapseStorage::associateMetadataWithFiles")
2614    def associateMetadataWithFiles(
2615        self,
2616        dmge: DataModelGraphExplorer,
2617        metadataManifestPath: str,
2618        datasetId: str,
2619        manifest_record_type: str = "table_file_and_entities",
2620        hideBlanks: bool = False,
2621        restrict_manifest=False,
2622        table_manipulation: str = "replace",
2623        table_column_names: str = "class_label",
2624        annotation_keys: str = "class_label",
2625        file_annotations_upload: bool = True,
2626    ) -> str:
2627        """Associate metadata with files in a storage dataset already on Synapse.
2628        Upload metadataManifest in the storage dataset folder on Synapse as well. Return synapseId of the uploaded manifest file.
2629
2630        If this is a new manifest there could be no Synapse entities associated with the rows of this manifest
2631        this may be due to data type (e.g. clinical data) being tabular
2632        and not requiring files; to utilize uniform interfaces downstream
2633        (i.e. fileviews), a Synapse entity (a folder) is created for each row
2634        and an entity column is added to the manifest containing the resulting
2635        entity IDs; a table is also created at present as an additional interface
2636        for downstream query and interaction with the data.
2637
2638        Args:
2639            dmge: DataModelGraphExplorer Object
2640            metadataManifestPath: path to csv containing a validated metadata manifest.
2641            The manifest should include a column entityId containing synapse IDs of files/entities to be associated with metadata, if that is applicable to the dataset type.
2642            Some datasets, e.g. clinical data, do not contain file id's, but data is stored in a table: one row per item.
2643            In this case, the system creates a file on Synapse for each row in the table (e.g. patient, biospecimen) and associates the columnset data as metadata/annotations to his file.
2644            datasetId: synapse ID of folder containing the dataset
2645            manifest_record_type: Default value is 'table_file_and_entities'. valid values are 'file_only', 'file_and_entities', 'table_and_file' or 'table_file_and_entities'. 'file_and_entities' will store the manifest as a csv and create Synapse files for each row in the manifest.'table_and_file' will store the manifest as a table and a csv on Synapse. 'file_only' will store the manifest as a csv only on Synapse. 'table_file_and_entities' will perform the options file_with_entites and table in combination.
2646            hideBlanks: Default is false. Boolean flag that does not upload annotation keys with blank values when true. Uploads Annotation keys with empty string values when false.
2647            restrict_manifest (bool): Default is false. Flag for censored data.
2648            table_malnipulation (str): Default is 'replace'. Specify the way the manifest tables should be store as on Synapse when one with the same name already exists. Options are 'replace' and 'upsert'.
2649            table_column_names: (str): display_name/display_label/class_label (default). Sets labeling style for table column names. display_name will use the raw display name as the column name. class_label will format the display
2650                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2651                display label formatting.
2652            annotation_keys: (str) display_label/class_label (default), Sets labeling syle for annotation keys. class_label will format the display
2653                name as upper camelcase, and strip blacklisted characters, display_label will strip blacklisted characters including spaces, to retain
2654                display label formatting while ensuring the label is formatted properly for Synapse annotations.
2655        Returns:
2656            manifest_synapse_file_id: SynID of manifest csv uploaded to synapse.
2657        """
2658        # Read new manifest CSV:
2659        manifest = self._read_manifest(metadataManifestPath)
2660        manifest = self._add_id_columns_to_manifest(manifest, dmge)
2661
2662        table_name, component_name = self._generate_table_name(manifest)
2663
2664        # Upload manifest to synapse based on user input (manifest_record_type)
2665        if manifest_record_type == "file_only":
2666            manifest_synapse_file_id = self.upload_manifest_as_csv(
2667                dmge=dmge,
2668                manifest=manifest,
2669                metadataManifestPath=metadataManifestPath,
2670                datasetId=datasetId,
2671                restrict=restrict_manifest,
2672                hideBlanks=hideBlanks,
2673                manifest_record_type=manifest_record_type,
2674                component_name=component_name,
2675                annotation_keys=annotation_keys,
2676                file_annotations_upload=file_annotations_upload,
2677            )
2678        elif manifest_record_type == "table_and_file":
2679            manifest_synapse_file_id = self.upload_manifest_as_table(
2680                dmge=dmge,
2681                manifest=manifest,
2682                metadataManifestPath=metadataManifestPath,
2683                datasetId=datasetId,
2684                table_name=table_name,
2685                component_name=component_name,
2686                restrict=restrict_manifest,
2687                hideBlanks=hideBlanks,
2688                manifest_record_type=manifest_record_type,
2689                table_manipulation=table_manipulation,
2690                table_column_names=table_column_names,
2691                annotation_keys=annotation_keys,
2692                file_annotations_upload=file_annotations_upload,
2693            )
2694        elif manifest_record_type == "file_and_entities":
2695            manifest_synapse_file_id = self.upload_manifest_as_csv(
2696                dmge=dmge,
2697                manifest=manifest,
2698                metadataManifestPath=metadataManifestPath,
2699                datasetId=datasetId,
2700                restrict=restrict_manifest,
2701                hideBlanks=hideBlanks,
2702                manifest_record_type=manifest_record_type,
2703                component_name=component_name,
2704                annotation_keys=annotation_keys,
2705                file_annotations_upload=file_annotations_upload,
2706            )
2707        elif manifest_record_type == "table_file_and_entities":
2708            manifest_synapse_file_id = self.upload_manifest_combo(
2709                dmge=dmge,
2710                manifest=manifest,
2711                metadataManifestPath=metadataManifestPath,
2712                datasetId=datasetId,
2713                table_name=table_name,
2714                component_name=component_name,
2715                restrict=restrict_manifest,
2716                hideBlanks=hideBlanks,
2717                manifest_record_type=manifest_record_type,
2718                table_manipulation=table_manipulation,
2719                table_column_names=table_column_names,
2720                annotation_keys=annotation_keys,
2721                file_annotations_upload=file_annotations_upload,
2722            )
2723        else:
2724            raise ValueError("Please enter a valid manifest_record_type.")
2725        return manifest_synapse_file_id
2726
2727    def getTableAnnotations(self, table_id: str):
2728        """Generate dictionary of annotations for the given Synapse file.
2729        Synapse returns all custom annotations as lists since they
2730        can contain multiple values. In all cases, the values will
2731        be converted into strings and concatenated with ", ".
2732
2733        Args:
2734            fileId (str): Synapse ID for dataset file.
2735
2736        Returns:
2737            dict: Annotations as comma-separated strings.
2738        """
2739        try:
2740            entity = self.synapse_entity_tracker.get(
2741                synapse_id=table_id, syn=self.syn, download_file=False
2742            )
2743            is_table = entity.concreteType.endswith(".TableEntity")
2744            annotations_raw = entity.annotations
2745        except SynapseHTTPError:
2746            # If an error occurs with retrieving entity, skip it
2747            # This could be caused by a temporary file view that
2748            # was deleted since its ID was retrieved
2749            is_file, is_table = False, False
2750
2751        # Skip anything that isn't a file or folder
2752        if not (is_table):
2753            return None
2754
2755        annotations = self.getEntityAnnotations(table_id, entity, annotations_raw)
2756
2757        return annotations
2758
2759    def getFileAnnotations(self, fileId: str) -> Dict[str, str]:
2760        """Generate dictionary of annotations for the given Synapse file.
2761        Synapse returns all custom annotations as lists since they
2762        can contain multiple values. In all cases, the values will
2763        be converted into strings and concatenated with ", ".
2764
2765        Args:
2766            fileId (str): Synapse ID for dataset file.
2767
2768        Returns:
2769            dict: Annotations as comma-separated strings.
2770        """
2771
2772        # Get entity metadata, including annotations
2773        try:
2774            entity = self.synapse_entity_tracker.get(
2775                synapse_id=fileId, syn=self.syn, download_file=False
2776            )
2777            is_file = entity.concreteType.endswith(".FileEntity")
2778            is_folder = entity.concreteType.endswith(".Folder")
2779            annotations_raw = entity.annotations
2780        except SynapseHTTPError:
2781            # If an error occurs with retrieving entity, skip it
2782            # This could be caused by a temporary file view that
2783            # was deleted since its ID was retrieved
2784            is_file, is_folder = False, False
2785
2786        # Skip anything that isn't a file or folder
2787        if not (is_file or is_folder):
2788            return None
2789
2790        annotations = self.getEntityAnnotations(fileId, entity, annotations_raw)
2791
2792        return annotations
2793
2794    def getEntityAnnotations(self, fileId, entity, annotations_raw):
2795        # Extract annotations from their lists and stringify. For example:
2796        # {'YearofBirth': [1980], 'author': ['bruno', 'milen', 'sujay']}
2797        annotations = dict()
2798        for key, vals in annotations_raw.items():
2799            if isinstance(vals, list) and len(vals) == 1:
2800                annotations[key] = str(vals[0])
2801            else:
2802                annotations[key] = ", ".join(str(v) for v in vals)
2803
2804        # Add the file entity ID and eTag, which weren't lists
2805        assert fileId == entity.id, (
2806            "For some reason, the Synapse ID in the response doesn't match"
2807            "the Synapse ID sent in the request (via synapseclient)."
2808        )
2809        annotations["entityId"] = fileId
2810        annotations["eTag"] = entity.etag
2811
2812        return annotations
2813
2814    def getDatasetAnnotations(
2815        self, datasetId: str, fill_na: bool = True, force_batch: bool = False
2816    ) -> pd.DataFrame:
2817        """Generate table for annotations across all files in given dataset.
2818
2819        Args:
2820            datasetId (str): Synapse ID for dataset folder.
2821            fill_na (bool): Whether to replace missing values with
2822                blank strings.
2823            force_batch (bool): Whether to force the function to use
2824                the batch mode, which uses a file view to retrieve
2825                annotations for a given dataset. Default to False
2826                unless there are more than 50 files in the dataset.
2827
2828        Returns:
2829            pd.DataFrame: Table of annotations.
2830        """
2831        # Get all files in given dataset
2832        dataset_files = self.getFilesInStorageDataset(datasetId)
2833
2834        # if there are no dataset files, there are no annotations
2835        # return None
2836        if not dataset_files:
2837            return pd.DataFrame()
2838
2839        dataset_files_map = dict(dataset_files)
2840        dataset_file_ids, _ = list(zip(*dataset_files))
2841
2842        # Get annotations for each file from Step 1
2843        # Batch mode
2844        try_batch = len(dataset_files) >= 50 or force_batch
2845        if try_batch:
2846            try:
2847                logger.info("Trying batch mode for retrieving Synapse annotations")
2848                table = self.getDatasetAnnotationsBatch(datasetId, dataset_file_ids)
2849            except (SynapseAuthenticationError, SynapseHTTPError, ValueError):
2850                logger.info(
2851                    f"Unable to create a temporary file view bound to {datasetId}. "
2852                    "Defaulting to slower iterative retrieval of annotations."
2853                )
2854                # Default to the slower non-batch method
2855                logger.info("Batch mode failed (probably due to permission error)")
2856                try_batch = False
2857
2858        # Non-batch mode
2859        if not try_batch:
2860            logger.info("Using slower (non-batch) sequential mode")
2861            records = [self.getFileAnnotations(i) for i in dataset_file_ids]
2862            # Remove any annotations for non-file/folders (stored as None)
2863            records = filter(None, records)
2864            table = pd.DataFrame.from_records(records)
2865
2866        # Add filenames for the files that "survived" annotation retrieval
2867        filenames = [dataset_files_map[i] for i in table["entityId"]]
2868
2869        if "Filename" not in table.columns:
2870            table.insert(0, "Filename", filenames)
2871
2872        # Ensure that entityId and eTag are at the end
2873        entity_ids = table.pop("entityId")
2874        etags = table.pop("eTag")
2875        table.insert(len(table.columns), "entityId", entity_ids)
2876        table.insert(len(table.columns), "eTag", etags)
2877
2878        # Missing values are filled in with empty strings for Google Sheets
2879        if fill_na:
2880            table.fillna("", inplace=True)
2881
2882        # Force all values as strings
2883        return table.astype(str)
2884
2885    def raise_final_error(retry_state):
2886        return retry_state.outcome.result()
2887
2888    def checkIfinAssetView(self, syn_id) -> str:
2889        # get data in administrative fileview for this pipeline
2890        assetViewTable = self.getStorageFileviewTable()
2891        all_files = list(assetViewTable["id"])
2892        if syn_id in all_files:
2893            return True
2894        else:
2895            return False
2896
2897    @tracer.start_as_current_span("SynapseStorage::getDatasetProject")
2898    @retry(
2899        stop=stop_after_attempt(5),
2900        wait=wait_chain(
2901            *[wait_fixed(10) for i in range(2)]
2902            + [wait_fixed(15) for i in range(2)]
2903            + [wait_fixed(20)]
2904        ),
2905        retry=retry_if_exception_type(LookupError),
2906        retry_error_callback=raise_final_error,
2907    )
2908    def getDatasetProject(self, datasetId: str) -> str:
2909        """Get parent project for a given dataset ID.
2910
2911        Args:
2912            datasetId (str): Synapse entity ID (folder or project).
2913
2914        Raises:
2915            ValueError: Raised if Synapse ID cannot be retrieved
2916            by the user or if it doesn't appear in the file view.
2917
2918        Returns:
2919            str: The Synapse ID for the parent project.
2920        """
2921
2922        # Subset main file view
2923        dataset_index = self.storageFileviewTable["id"] == datasetId
2924        dataset_row = self.storageFileviewTable[dataset_index]
2925
2926        # re-query if no datasets found
2927        if dataset_row.empty:
2928            sleep(5)
2929            self.query_fileview(force_requery=True)
2930            # Subset main file view
2931            dataset_index = self.storageFileviewTable["id"] == datasetId
2932            dataset_row = self.storageFileviewTable[dataset_index]
2933
2934        # Return `projectId` for given row if only one found
2935        if len(dataset_row) == 1:
2936            dataset_project = dataset_row["projectId"].values[0]
2937            return dataset_project
2938
2939        # Otherwise, check if already project itself
2940        try:
2941            syn_object = self.synapse_entity_tracker.get(
2942                synapse_id=datasetId, syn=self.syn, download_file=False
2943            )
2944            if syn_object.properties["concreteType"].endswith("Project"):
2945                return datasetId
2946        except SynapseHTTPError:
2947            raise PermissionError(
2948                f"The given dataset ({datasetId}) isn't accessible with this "
2949                "user. This might be caused by a typo in the dataset Synapse ID."
2950            )
2951
2952        # If not, then assume dataset not in file view
2953        raise LookupError(
2954            f"The given dataset ({datasetId}) doesn't appear in the "
2955            f"configured file view ({self.storageFileview}). This might "
2956            "mean that the file view's scope needs to be updated."
2957        )
2958
2959    def getDatasetAnnotationsBatch(
2960        self, datasetId: str, dataset_file_ids: Sequence[str] = None
2961    ) -> pd.DataFrame:
2962        """Generate table for annotations across all files in given dataset.
2963        This function uses a temporary file view to generate a table
2964        instead of iteratively querying for individual entity annotations.
2965        This function is expected to run much faster than
2966        `self.getDatasetAnnotationsBatch` on large datasets.
2967
2968        Args:
2969            datasetId (str): Synapse ID for dataset folder.
2970            dataset_file_ids (Sequence[str]): List of Synapse IDs
2971                for dataset files/folders used to subset the table.
2972
2973        Returns:
2974            pd.DataFrame: Table of annotations.
2975        """
2976        # Create data frame from annotations file view
2977        with DatasetFileView(datasetId, self.syn) as fileview:
2978            table = fileview.query()
2979
2980        if dataset_file_ids:
2981            table = table.loc[table.index.intersection(dataset_file_ids)]
2982
2983        table = table.reset_index(drop=True)
2984
2985        return table
2986
2987    def _get_table_schema_by_cname(self, table_schema):
2988        # assume no duplicate column names in the table
2989        table_schema_by_cname = {}
2990
2991        for col_record in table_schema:
2992            # TODO clean up dictionary for compactness (e.g. remove redundant 'name' key)
2993            table_schema_by_cname[col_record["name"]] = col_record
2994
2995        return table_schema_by_cname
2996
2997
2998class TableOperations:
2999    """
3000    Object to hold functions for various table operations specific to the Synapse Asset Store.
3001
3002    Currently implement operations are:
3003    createTable: upload a manifest as a new table when none exist
3004    replaceTable: replace a metadata in a table from one manifest with metadata from another manifest
3005    updateTable: add a column to a table that already exists on synapse
3006
3007    Operations currently in development are:
3008    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
3009    """
3010
3011    def __init__(
3012        self,
3013        synStore: SynapseStorage,
3014        tableToLoad: pd.DataFrame = None,
3015        tableName: str = None,
3016        datasetId: str = None,
3017        existingTableId: str = None,
3018        restrict: bool = False,
3019        synapse_entity_tracker: SynapseEntityTracker = None,
3020    ):
3021        """
3022        Class governing table operations (creation, replacement, upserts, updates) in schematic
3023
3024        tableToLoad: manifest formatted appropriately for the table
3025        tableName: name of the table to be uploaded
3026        datasetId: synID of the dataset for the manifest
3027        existingTableId: synId of the table currently exising on synapse (if there is one)
3028        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3029        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3030
3031        """
3032        self.synStore = synStore
3033        self.tableToLoad = tableToLoad
3034        self.tableName = tableName
3035        self.datasetId = datasetId
3036        self.existingTableId = existingTableId
3037        self.restrict = restrict
3038        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3039
3040    @tracer.start_as_current_span("TableOperations::createTable")
3041    def createTable(
3042        self,
3043        columnTypeDict: dict = None,
3044        specifySchema: bool = True,
3045    ):
3046        """
3047        Method to create a table from a metadata manifest and upload it to synapse
3048
3049        Args:
3050            columnTypeDict: dictionary schema for table columns: type, size, etc
3051            specifySchema: to specify a specific schema for the table format
3052
3053        Returns:
3054            table.schema.id: synID of the newly created table
3055        """
3056        datasetEntity = self.synapse_entity_tracker.get(
3057            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3058        )
3059        datasetName = datasetEntity.name
3060        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3061
3062        if not self.tableName:
3063            self.tableName = datasetName + "table"
3064        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3065        if specifySchema:
3066            if columnTypeDict == {}:
3067                logger.error("Did not provide a columnTypeDict.")
3068            # create list of columns:
3069            cols = []
3070            for col in self.tableToLoad.columns:
3071                if col in table_schema_by_cname:
3072                    col_type = table_schema_by_cname[col]["columnType"]
3073                    max_size = (
3074                        table_schema_by_cname[col]["maximumSize"]
3075                        if "maximumSize" in table_schema_by_cname[col].keys()
3076                        else 100
3077                    )
3078                    max_list_len = 250
3079                    if max_size and max_list_len:
3080                        cols.append(
3081                            Column(
3082                                name=col,
3083                                columnType=col_type,
3084                                maximumSize=max_size,
3085                                maximumListLength=max_list_len,
3086                            )
3087                        )
3088                    elif max_size:
3089                        cols.append(
3090                            Column(name=col, columnType=col_type, maximumSize=max_size)
3091                        )
3092                    else:
3093                        cols.append(Column(name=col, columnType=col_type))
3094                else:
3095                    # TODO add warning that the given col was not found and it's max size is set to 100
3096                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3097            schema = Schema(
3098                name=self.tableName, columns=cols, parent=datasetParentProject
3099            )
3100            table = Table(schema, self.tableToLoad)
3101            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3102            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3103            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3104            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3105            return table.schema.id
3106        else:
3107            # For just uploading the tables to synapse using default
3108            # column types.
3109            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3110            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3111            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3112            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3113            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3114            return table.schema.id
3115
3116    @tracer.start_as_current_span("TableOperations::replaceTable")
3117    def replaceTable(
3118        self,
3119        specifySchema: bool = True,
3120        columnTypeDict: dict = None,
3121    ):
3122        """
3123        Method to replace an existing table on synapse with metadata from a new manifest
3124
3125        Args:
3126            specifySchema: to infer a schema for the table format
3127            columnTypeDict: dictionary schema for table columns: type, size, etc
3128
3129        Returns:
3130           existingTableId: synID of the already existing table that had its metadata replaced
3131        """
3132        datasetEntity = self.synapse_entity_tracker.get(
3133            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3134        )
3135
3136        datasetName = datasetEntity.name
3137        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3138        existing_table, existing_results = self.synStore.get_synapse_table(
3139            self.existingTableId
3140        )
3141        # remove rows
3142        self.synStore.syn.delete(existing_results)
3143        # Data changes such as removing all rows causes the eTag to change.
3144        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3145        # wait for row deletion to finish on synapse before getting empty table
3146        sleep(10)
3147
3148        # removes all current columns
3149        current_table = self.synapse_entity_tracker.get(
3150            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3151        )
3152
3153        current_columns = self.synStore.syn.getTableColumns(current_table)
3154
3155        for col in current_columns:
3156            current_table.removeColumn(col)
3157
3158        if not self.tableName:
3159            self.tableName = datasetName + "table"
3160
3161        # Process columns according to manifest entries
3162        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3163        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3164        if specifySchema:
3165            if columnTypeDict == {}:
3166                logger.error("Did not provide a columnTypeDict.")
3167            # create list of columns:
3168            cols = []
3169
3170            for col in self.tableToLoad.columns:
3171                if col in table_schema_by_cname:
3172                    col_type = table_schema_by_cname[col]["columnType"]
3173                    max_size = (
3174                        table_schema_by_cname[col]["maximumSize"]
3175                        if "maximumSize" in table_schema_by_cname[col].keys()
3176                        else 100
3177                    )
3178                    max_list_len = 250
3179                    if max_size and max_list_len:
3180                        cols.append(
3181                            Column(
3182                                name=col,
3183                                columnType=col_type,
3184                                maximumSize=max_size,
3185                                maximumListLength=max_list_len,
3186                            )
3187                        )
3188                    elif max_size:
3189                        cols.append(
3190                            Column(name=col, columnType=col_type, maximumSize=max_size)
3191                        )
3192                    else:
3193                        cols.append(Column(name=col, columnType=col_type))
3194                else:
3195                    # TODO add warning that the given col was not found and it's max size is set to 100
3196                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3197
3198            # adds new columns to schema
3199            for col in cols:
3200                current_table.addColumn(col)
3201
3202            table_result = self.synStore.syn.store(
3203                current_table, isRestricted=self.restrict
3204            )
3205            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3206            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3207            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3208
3209            # wait for synapse store to finish
3210            sleep(1)
3211
3212            # build schema and table from columns and store with necessary restrictions
3213            schema = Schema(
3214                name=self.tableName, columns=cols, parent=datasetParentProject
3215            )
3216            schema.id = self.existingTableId
3217            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3218            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3219            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3220            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3221            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3222        else:
3223            logging.error("Must specify a schema for table replacements")
3224
3225        # remove system metadata from manifest
3226        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3227        return self.existingTableId
3228
3229    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3230    def _get_auth_token(
3231        self,
3232    ):
3233        authtoken = None
3234
3235        # Get access token from environment variable if available
3236        # Primarily useful for testing environments, with other possible usefulness for containers
3237        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3238        if env_access_token:
3239            authtoken = env_access_token
3240            return authtoken
3241
3242        # Get token from authorization header
3243        # Primarily useful for API endpoint functionality
3244        if "Authorization" in self.synStore.syn.default_headers:
3245            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3246                "Bearer "
3247            )[-1]
3248            return authtoken
3249
3250        # retrive credentials from synapse object
3251        # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe
3252        synapse_object_creds = self.synStore.syn.credentials
3253        if hasattr(synapse_object_creds, "_token"):
3254            authtoken = synapse_object_creds.secret
3255
3256        # Try getting creds from .synapseConfig file if it exists
3257        # Primarily useful for local users. Seems to correlate with credentials stored in synaspe object when logged in
3258        if os.path.exists(CONFIG.synapse_configuration_path):
3259            config = get_config_file(CONFIG.synapse_configuration_path)
3260
3261            # check which credentials are provided in file
3262            if config.has_option("authentication", "authtoken"):
3263                authtoken = config.get("authentication", "authtoken")
3264
3265        # raise error if required credentials are not found
3266        if not authtoken:
3267            raise NameError(
3268                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3269            )
3270
3271        return authtoken
3272
3273    @tracer.start_as_current_span("TableOperations::upsertTable")
3274    def upsertTable(self, dmge: DataModelGraphExplorer):
3275        """
3276        Method to upsert rows from a new manifest into an existing table on synapse
3277        For upsert functionality to work, primary keys must follow the naming convention of <componenet>_id
3278        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3279        Currently it is required to use -tcn "display label" with table upserts.
3280
3281
3282        Args:
3283            dmge: DataModelGraphExplorer instance
3284
3285        Returns:
3286           existingTableId: synID of the already existing table that had its metadata replaced
3287        """
3288
3289        authtoken = self._get_auth_token()
3290
3291        synapseDB = SynapseDatabase(
3292            auth_token=authtoken,
3293            project_id=self.synStore.getDatasetProject(self.datasetId),
3294            syn=self.synStore.syn,
3295            synapse_entity_tracker=self.synapse_entity_tracker,
3296        )
3297
3298        try:
3299            # Try performing upsert
3300            synapseDB.upsert_table_rows(
3301                table_name=self.tableName, data=self.tableToLoad
3302            )
3303        except SynapseHTTPError as ex:
3304            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3305            if "Id is not a valid column name or id" in str(ex):
3306                self._update_table_uuid_column(dmge)
3307                synapseDB.upsert_table_rows(
3308                    table_name=self.tableName, data=self.tableToLoad
3309                )
3310            # Raise if other error
3311            else:
3312                raise ex
3313
3314        return self.existingTableId
3315
3316    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3317    def _update_table_uuid_column(
3318        self,
3319        dmge: DataModelGraphExplorer,
3320    ) -> None:
3321        """Removes the `Uuid` column when present, and relpaces with an `Id` column
3322        Used to enable backwards compatability for manifests using the old `Uuid` convention
3323
3324        Args:
3325            dmge: DataModelGraphExplorer instance
3326
3327        Returns:
3328            None
3329        """
3330
3331        # Get the columns of the schema
3332        schema = self.synapse_entity_tracker.get(
3333            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3334        )
3335
3336        cols = self.synStore.syn.getTableColumns(schema)
3337
3338        # Iterate through columns until `Uuid` column is found
3339        for col in cols:
3340            if col.name.lower() == "uuid":
3341                # See if schema has `Uuid` column specified
3342                try:
3343                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3344                except KeyError:
3345                    uuid_col_in_schema = False
3346
3347                # If there is, then create a new `Id` column from scratch
3348                if uuid_col_in_schema:
3349                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3350                    schema.addColumn(new_col)
3351                    schema = self.synStore.syn.store(schema)
3352                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3353                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3354                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3355                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3356                else:
3357                    # Build ColumnModel that will be used for new column
3358                    id_column = Column(
3359                        name="Id",
3360                        columnType="STRING",
3361                        maximumSize=64,
3362                        defaultValue=None,
3363                        maximumListLength=1,
3364                    )
3365                    new_col_response = self.synStore.syn.store(id_column)
3366
3367                    # Define columnChange body
3368                    columnChangeDict = {
3369                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3370                        "entityId": self.existingTableId,
3371                        "changes": [
3372                            {
3373                                "oldColumnId": col["id"],
3374                                "newColumnId": new_col_response["id"],
3375                            }
3376                        ],
3377                    }
3378
3379                    self.synStore.syn._async_table_update(
3380                        table=self.existingTableId,
3381                        changes=[columnChangeDict],
3382                        wait=False,
3383                    )
3384                break
3385
3386        return
3387
3388    @tracer.start_as_current_span("TableOperations::updateTable")
3389    def updateTable(
3390        self,
3391        update_col: str = "Id",
3392    ):
3393        """
3394        Method to update an existing table with a new column
3395
3396        Args:
3397            updateCol: column to index the old and new tables on
3398
3399        Returns:
3400           existingTableId: synID of the already existing table that had its metadata replaced
3401        """
3402        existing_table, existing_results = self.synStore.get_synapse_table(
3403            self.existingTableId
3404        )
3405
3406        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3407        # store table with existing etag data and impose restrictions as appropriate
3408        table_result = self.synStore.syn.store(
3409            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3410            isRestricted=self.restrict,
3411        )
3412        # We cannot store the Table to the `synapse_entity_tracker` because there is
3413        # not `Schema` on the table object. The above `.store()` function call would
3414        # also update the ETag of the entity within Synapse. Remove it from the tracker
3415        # and re-retrieve it later on if needed again.
3416        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3417
3418        return self.existingTableId
3419
3420
3421class DatasetFileView:
3422    """Helper class to create temporary dataset file views.
3423    This class can be used in conjunction with a 'with' statement.
3424    This will ensure that the file view is deleted automatically.
3425    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3426    """
3427
3428    def __init__(
3429        self,
3430        datasetId: str,
3431        synapse: Synapse,
3432        name: str = None,
3433        temporary: bool = True,
3434        parentId: str = None,
3435    ) -> None:
3436        """Create a file view scoped to a dataset folder.
3437
3438        Args:
3439            datasetId (str): Synapse ID for a dataset folder/project.
3440            synapse (Synapse): Used for Synapse requests.
3441            name (str): Name of the file view (temporary or not).
3442            temporary (bool): Whether to delete the file view on exit
3443                of either a 'with' statement or Python entirely.
3444            parentId (str, optional): Synapse ID specifying where to
3445                store the file view. Defaults to datasetId.
3446        """
3447
3448        self.datasetId = datasetId
3449        self.synapse = synapse
3450        self.is_temporary = temporary
3451
3452        if name is None:
3453            self.name = f"schematic annotation file view for {self.datasetId}"
3454
3455        if self.is_temporary:
3456            uid = secrets.token_urlsafe(5)
3457            self.name = f"{self.name} - UID {uid}"
3458
3459        # TODO: Allow a DCC admin to configure a "universal parent"
3460        #       Such as a Synapse project writeable by everyone.
3461        self.parentId = datasetId if parentId is None else parentId
3462
3463        # TODO: Create local sharing setting to hide from everyone else
3464        view_schema = EntityViewSchema(
3465            name=self.name,
3466            parent=self.parentId,
3467            scopes=self.datasetId,
3468            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3469            addDefaultViewColumns=False,
3470            addAnnotationColumns=True,
3471        )
3472
3473        # TODO: Handle failure due to insufficient permissions by
3474        #       creating a temporary new project to store view
3475        self.view_schema = self.synapse.store(view_schema)
3476
3477        # These are filled in after calling `self.query()`
3478        self.results = None
3479        self.table = None
3480
3481        # Ensure deletion of the file view (last resort)
3482        if self.is_temporary:
3483            atexit.register(self.delete)
3484
3485    def __enter__(self):
3486        """Return file view when entering 'with' statement."""
3487        return self
3488
3489    def __exit__(self, exc_type, exc_value, traceback):
3490        """Delete file view when exiting 'with' statement."""
3491        if self.is_temporary:
3492            self.delete()
3493
3494    def delete(self):
3495        """Delete the file view on Synapse without deleting local table."""
3496        if self.view_schema is not None:
3497            self.synapse.delete(self.view_schema)
3498            self.view_schema = None
3499
3500    def query(self, tidy=True, force=False):
3501        """Retrieve file view as a data frame (raw format sans index)."""
3502        if self.table is None or force:
3503            fileview_id = self.view_schema["id"]
3504            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3505            self.table = self.results.asDataFrame(
3506                rowIdAndVersionInIndex=False,
3507                na_values=STR_NA_VALUES_FILTERED,
3508                keep_default_na=False,
3509            )
3510        if tidy:
3511            self.tidy_table()
3512        return self.table
3513
3514    def tidy_table(self):
3515        """Convert raw file view data frame into more usable format."""
3516        assert self.table is not None, "Must call `self.query()` first."
3517        self._fix_default_columns()
3518        self._fix_list_columns()
3519        self._fix_int_columns()
3520        return self.table
3521
3522    def _fix_default_columns(self):
3523        """Rename default columns to match schematic expectations."""
3524
3525        # Drop ROW_VERSION column if present
3526        if "ROW_VERSION" in self.table:
3527            del self.table["ROW_VERSION"]
3528
3529        # Rename id column to entityId and set as data frame index
3530        if "ROW_ID" in self.table:
3531            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3532            self.table = self.table.set_index("entityId", drop=False)
3533            del self.table["ROW_ID"]
3534
3535        # Rename ROW_ETAG column to eTag and place at end of data frame
3536        if "ROW_ETAG" in self.table:
3537            row_etags = self.table.pop("ROW_ETAG")
3538
3539            # eTag column may already present if users annotated data without submitting manifest
3540            # we're only concerned with the new values and not the existing ones
3541            if "eTag" in self.table:
3542                del self.table["eTag"]
3543
3544            self.table.insert(len(self.table.columns), "eTag", row_etags)
3545
3546        return self.table
3547
3548    def _get_columns_of_type(self, types):
3549        """Helper function to get list of columns of a given type(s)."""
3550        matching_columns = []
3551        for header in self.results.headers:
3552            if header.columnType in types:
3553                matching_columns.append(header.name)
3554        return matching_columns
3555
3556    def _fix_list_columns(self):
3557        """Fix formatting of list-columns."""
3558        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3559        list_columns = self._get_columns_of_type(list_types)
3560        for col in list_columns:
3561            self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
3562        return self.table
3563
3564    def _fix_int_columns(self):
3565        """Ensure that integer-columns are actually integers."""
3566        int_columns = self._get_columns_of_type({"INTEGER"})
3567        for col in int_columns:
3568            # Coercing to string because NaN is a floating point value
3569            # and cannot exist alongside integers in a column
3570            def to_int_fn(x):
3571                return "" if np.isnan(x) else str(int(x))
3572
3573            self.table[col] = self.table[col].apply(to_int_fn)
3574        return self.table

logger = <Logger Synapse storage (WARNING)>

tracer = <opentelemetry.sdk.trace.Tracer object>

ID_COLUMN = 'Id'

ENTITY_ID_COLUMN = 'entityId'

UUID_COLUMN = 'uuid'

@dataclass

class ManifestDownload: View Source

 91@dataclass
 92class ManifestDownload(object):
 93    """
 94    syn: an object of type synapseclient.
 95    manifest_id: id of a manifest
 96    synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
 97    """
 98
 99    syn: synapseclient.Synapse
100    manifest_id: str
101    synapse_entity_tracker: SynapseEntityTracker = field(
102        default_factory=SynapseEntityTracker
103    )
104
105    def _download_manifest_to_folder(self, use_temporary_folder: bool = True) -> File:
106        """
107        Try downloading a manifest to a specific folder (temporary or not). When the
108        `use_temporary_folder` is set to True, the manifest will be downloaded to a
109        temporary folder. This is useful for when the code is running as an API server
110        where multiple requests are being made at the same time. This will prevent
111        multiple requests from overwriting the same manifest file. When the
112        `use_temporary_folder` is set to False, the manifest will be downloaded to the
113        default manifest folder.
114
115        Args:
116            use_temporary_folder: boolean argument indicating if a temporary folder
117                should be used to store the manifest file. This is useful when running
118                this code as an API server where multiple requests could be made at the
119                same time. This is set to False when the code is being used from the
120                CLI. Defaults to True.
121
122        Return:
123            manifest_data: A Synapse file entity of the downloaded manifest
124        """
125        manifest_data = self.synapse_entity_tracker.get(
126            synapse_id=self.manifest_id,
127            syn=self.syn,
128            download_file=False,
129            retrieve_if_not_present=False,
130        )
131        current_span = trace.get_current_span()
132        if (
133            manifest_data
134            and (file_handle := manifest_data.get("_file_handle", None))
135            and current_span.is_recording()
136        ):
137            current_span.set_attribute(
138                "schematic.manifest_size", file_handle.get("contentSize", 0)
139            )
140
141        if manifest_data and manifest_data.path:
142            return manifest_data
143
144        if "SECRETS_MANAGER_SECRETS" in os.environ:
145            temporary_manifest_storage = "/var/tmp/temp_manifest_download"
146            cleanup_temporary_storage(
147                temporary_manifest_storage, time_delta_seconds=3600
148            )
149            # create a new directory to store manifest
150            if not os.path.exists(temporary_manifest_storage):
151                os.mkdir(temporary_manifest_storage)
152            # create temporary folders for storing manifests
153            download_location = create_temp_folder(
154                path=temporary_manifest_storage,
155                prefix=f"{self.manifest_id}-{time.time()}-",
156            )
157        else:
158            if use_temporary_folder:
159                download_location = create_temp_folder(
160                    path=CONFIG.manifest_folder,
161                    prefix=f"{self.manifest_id}-{time.time()}-",
162                )
163            else:
164                download_location = CONFIG.manifest_folder
165
166        manifest_data = self.synapse_entity_tracker.get(
167            synapse_id=self.manifest_id,
168            syn=self.syn,
169            download_file=True,
170            retrieve_if_not_present=True,
171            download_location=download_location,
172        )
173
174        # This is doing a rename of the downloaded file. The reason this is important
175        # is that if we are re-using a file that was previously downloaded, but the
176        # file had been renamed. The file downloaded from the Synapse client is just
177        # a direct copy of that renamed file. This code will set the name of the file
178        # to the original name that was used to download the file. Note: An MD5 checksum
179        # of the file will still be performed so if the file has changed, it will be
180        # downloaded again.
181        filename = manifest_data._file_handle.fileName
182        if filename != os.path.basename(manifest_data.path):
183            parent_folder = os.path.dirname(manifest_data.path)
184            manifest_original_name_and_path = os.path.join(parent_folder, filename)
185
186            self.syn.cache.remove(
187                file_handle_id=manifest_data.dataFileHandleId, path=manifest_data.path
188            )
189            os.rename(manifest_data.path, manifest_original_name_and_path)
190            manifest_data.path = manifest_original_name_and_path
191            self.syn.cache.add(
192                file_handle_id=manifest_data.dataFileHandleId,
193                path=manifest_original_name_and_path,
194                md5=manifest_data._file_handle.contentMd5,
195            )
196
197        return manifest_data
198
199    def _entity_type_checking(self) -> str:
200        """
201        check the entity type of the id that needs to be downloaded
202        Return:
203             if the entity type is wrong, raise an error
204        """
205        # check the type of entity
206        entity_type = entity_type_mapping(
207            syn=self.syn,
208            entity_id=self.manifest_id,
209            synapse_entity_tracker=self.synapse_entity_tracker,
210        )
211        if entity_type != "file":
212            logger.error(
213                f"You are using entity type: {entity_type}. Please provide a file ID"
214            )
215
216    def download_manifest(
217        self,
218        newManifestName: str = "",
219        manifest_df: pd.DataFrame = pd.DataFrame(),
220        use_temporary_folder: bool = True,
221    ) -> Union[str, File]:
222        """
223        Download a manifest based on a given manifest id.
224        Args:
225            newManifestName(optional): new name of a manifest that gets downloaded.
226            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
227        Return:
228            manifest_data: synapse entity file object
229        """
230
231        # enables retrying if user does not have access to uncensored manifest
232        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
233        manifest_data = ""
234
235        # check entity type
236        self._entity_type_checking()
237
238        # download a manifest
239        try:
240            manifest_data = self._download_manifest_to_folder(
241                use_temporary_folder=use_temporary_folder
242            )
243        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
244            # if there's an error getting an uncensored manifest, try getting the censored manifest
245            if not manifest_df.empty:
246                censored_regex = re.compile(".*censored.*")
247                censored = manifest_df["name"].str.contains(censored_regex)
248                new_manifest_id = manifest_df[censored]["id"][0]
249                self.manifest_id = new_manifest_id
250                try:
251                    manifest_data = self._download_manifest_to_folder(
252                        use_temporary_folder=use_temporary_folder
253                    )
254                except (
255                    SynapseUnmetAccessRestrictions,
256                    SynapseAuthenticationError,
257                ) as e:
258                    raise PermissionError(
259                        "You don't have access to censored and uncensored manifests in this dataset."
260                    ) from e
261            else:
262                logger.error(
263                    f"You don't have access to the requested resource: {self.manifest_id}"
264                )
265
266        if newManifestName and os.path.exists(manifest_data.get("path")):
267            # Rename the file we just made to the new name
268            new_manifest_filename = newManifestName + ".csv"
269
270            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
271            parent_folder = os.path.dirname(manifest_data.get("path"))
272
273            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
274
275            # Copy file to new location. The purpose of using a copy instead of a rename
276            # is to avoid any potential issues with the file being used in another
277            # process. This avoids any potential race or code cocurrency conditions.
278            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
279
280            # Adding this to cache will allow us to re-use the already downloaded
281            # manifest file for up to 1 hour.
282            self.syn.cache.add(
283                file_handle_id=manifest_data.dataFileHandleId,
284                path=new_manifest_path_name,
285                md5=manifest_data._file_handle.contentMd5,
286            )
287
288            # Update file names/paths in manifest_data
289            manifest_data["name"] = new_manifest_filename
290            manifest_data["filename"] = new_manifest_filename
291            manifest_data["path"] = new_manifest_path_name
292
293        return manifest_data

syn: an object of type synapseclient. manifest_id: id of a manifest synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

ManifestDownload( syn: synapseclient.client.Synapse, manifest_id: str, synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = <factory>)

syn: synapseclient.client.Synapse

manifest_id: str

synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker

def download_manifest( self, newManifestName: str = '', manifest_df: pandas.core.frame.DataFrame = Empty DataFrame Columns: [] Index: [], use_temporary_folder: bool = True) -> Union[str, synapseclient.entity.File]: View Source

216    def download_manifest(
217        self,
218        newManifestName: str = "",
219        manifest_df: pd.DataFrame = pd.DataFrame(),
220        use_temporary_folder: bool = True,
221    ) -> Union[str, File]:
222        """
223        Download a manifest based on a given manifest id.
224        Args:
225            newManifestName(optional): new name of a manifest that gets downloaded.
226            manifest_df(optional): a dataframe containing name and id of manifests in a given asset view
227        Return:
228            manifest_data: synapse entity file object
229        """
230
231        # enables retrying if user does not have access to uncensored manifest
232        # pass synID to synapseclient.Synapse.get() method to download (and overwrite) file to a location
233        manifest_data = ""
234
235        # check entity type
236        self._entity_type_checking()
237
238        # download a manifest
239        try:
240            manifest_data = self._download_manifest_to_folder(
241                use_temporary_folder=use_temporary_folder
242            )
243        except (SynapseUnmetAccessRestrictions, SynapseAuthenticationError):
244            # if there's an error getting an uncensored manifest, try getting the censored manifest
245            if not manifest_df.empty:
246                censored_regex = re.compile(".*censored.*")
247                censored = manifest_df["name"].str.contains(censored_regex)
248                new_manifest_id = manifest_df[censored]["id"][0]
249                self.manifest_id = new_manifest_id
250                try:
251                    manifest_data = self._download_manifest_to_folder(
252                        use_temporary_folder=use_temporary_folder
253                    )
254                except (
255                    SynapseUnmetAccessRestrictions,
256                    SynapseAuthenticationError,
257                ) as e:
258                    raise PermissionError(
259                        "You don't have access to censored and uncensored manifests in this dataset."
260                    ) from e
261            else:
262                logger.error(
263                    f"You don't have access to the requested resource: {self.manifest_id}"
264                )
265
266        if newManifestName and os.path.exists(manifest_data.get("path")):
267            # Rename the file we just made to the new name
268            new_manifest_filename = newManifestName + ".csv"
269
270            # get location of existing manifest. The manifest that will be renamed should live in the same folder as existing manifest.
271            parent_folder = os.path.dirname(manifest_data.get("path"))
272
273            new_manifest_path_name = os.path.join(parent_folder, new_manifest_filename)
274
275            # Copy file to new location. The purpose of using a copy instead of a rename
276            # is to avoid any potential issues with the file being used in another
277            # process. This avoids any potential race or code cocurrency conditions.
278            shutil.copyfile(src=manifest_data["path"], dst=new_manifest_path_name)
279
280            # Adding this to cache will allow us to re-use the already downloaded
281            # manifest file for up to 1 hour.
282            self.syn.cache.add(
283                file_handle_id=manifest_data.dataFileHandleId,
284                path=new_manifest_path_name,
285                md5=manifest_data._file_handle.contentMd5,
286            )
287
288            # Update file names/paths in manifest_data
289            manifest_data["name"] = new_manifest_filename
290            manifest_data["filename"] = new_manifest_filename
291            manifest_data["path"] = new_manifest_path_name
292
293        return manifest_data

Download a manifest based on a given manifest id.

Arguments:

newManifestName(optional): new name of a manifest that gets downloaded.
manifest_df(optional): a dataframe containing name and id of manifests in a given asset view

Return:

manifest_data: synapse entity file object

class TableOperations: View Source

2999class TableOperations:
3000    """
3001    Object to hold functions for various table operations specific to the Synapse Asset Store.
3002
3003    Currently implement operations are:
3004    createTable: upload a manifest as a new table when none exist
3005    replaceTable: replace a metadata in a table from one manifest with metadata from another manifest
3006    updateTable: add a column to a table that already exists on synapse
3007
3008    Operations currently in development are:
3009    upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest
3010    """
3011
3012    def __init__(
3013        self,
3014        synStore: SynapseStorage,
3015        tableToLoad: pd.DataFrame = None,
3016        tableName: str = None,
3017        datasetId: str = None,
3018        existingTableId: str = None,
3019        restrict: bool = False,
3020        synapse_entity_tracker: SynapseEntityTracker = None,
3021    ):
3022        """
3023        Class governing table operations (creation, replacement, upserts, updates) in schematic
3024
3025        tableToLoad: manifest formatted appropriately for the table
3026        tableName: name of the table to be uploaded
3027        datasetId: synID of the dataset for the manifest
3028        existingTableId: synId of the table currently exising on synapse (if there is one)
3029        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3030        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3031
3032        """
3033        self.synStore = synStore
3034        self.tableToLoad = tableToLoad
3035        self.tableName = tableName
3036        self.datasetId = datasetId
3037        self.existingTableId = existingTableId
3038        self.restrict = restrict
3039        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()
3040
3041    @tracer.start_as_current_span("TableOperations::createTable")
3042    def createTable(
3043        self,
3044        columnTypeDict: dict = None,
3045        specifySchema: bool = True,
3046    ):
3047        """
3048        Method to create a table from a metadata manifest and upload it to synapse
3049
3050        Args:
3051            columnTypeDict: dictionary schema for table columns: type, size, etc
3052            specifySchema: to specify a specific schema for the table format
3053
3054        Returns:
3055            table.schema.id: synID of the newly created table
3056        """
3057        datasetEntity = self.synapse_entity_tracker.get(
3058            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3059        )
3060        datasetName = datasetEntity.name
3061        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3062
3063        if not self.tableName:
3064            self.tableName = datasetName + "table"
3065        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3066        if specifySchema:
3067            if columnTypeDict == {}:
3068                logger.error("Did not provide a columnTypeDict.")
3069            # create list of columns:
3070            cols = []
3071            for col in self.tableToLoad.columns:
3072                if col in table_schema_by_cname:
3073                    col_type = table_schema_by_cname[col]["columnType"]
3074                    max_size = (
3075                        table_schema_by_cname[col]["maximumSize"]
3076                        if "maximumSize" in table_schema_by_cname[col].keys()
3077                        else 100
3078                    )
3079                    max_list_len = 250
3080                    if max_size and max_list_len:
3081                        cols.append(
3082                            Column(
3083                                name=col,
3084                                columnType=col_type,
3085                                maximumSize=max_size,
3086                                maximumListLength=max_list_len,
3087                            )
3088                        )
3089                    elif max_size:
3090                        cols.append(
3091                            Column(name=col, columnType=col_type, maximumSize=max_size)
3092                        )
3093                    else:
3094                        cols.append(Column(name=col, columnType=col_type))
3095                else:
3096                    # TODO add warning that the given col was not found and it's max size is set to 100
3097                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3098            schema = Schema(
3099                name=self.tableName, columns=cols, parent=datasetParentProject
3100            )
3101            table = Table(schema, self.tableToLoad)
3102            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3103            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3104            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3105            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3106            return table.schema.id
3107        else:
3108            # For just uploading the tables to synapse using default
3109            # column types.
3110            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3111            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3112            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3113            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3114            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3115            return table.schema.id
3116
3117    @tracer.start_as_current_span("TableOperations::replaceTable")
3118    def replaceTable(
3119        self,
3120        specifySchema: bool = True,
3121        columnTypeDict: dict = None,
3122    ):
3123        """
3124        Method to replace an existing table on synapse with metadata from a new manifest
3125
3126        Args:
3127            specifySchema: to infer a schema for the table format
3128            columnTypeDict: dictionary schema for table columns: type, size, etc
3129
3130        Returns:
3131           existingTableId: synID of the already existing table that had its metadata replaced
3132        """
3133        datasetEntity = self.synapse_entity_tracker.get(
3134            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3135        )
3136
3137        datasetName = datasetEntity.name
3138        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3139        existing_table, existing_results = self.synStore.get_synapse_table(
3140            self.existingTableId
3141        )
3142        # remove rows
3143        self.synStore.syn.delete(existing_results)
3144        # Data changes such as removing all rows causes the eTag to change.
3145        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3146        # wait for row deletion to finish on synapse before getting empty table
3147        sleep(10)
3148
3149        # removes all current columns
3150        current_table = self.synapse_entity_tracker.get(
3151            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3152        )
3153
3154        current_columns = self.synStore.syn.getTableColumns(current_table)
3155
3156        for col in current_columns:
3157            current_table.removeColumn(col)
3158
3159        if not self.tableName:
3160            self.tableName = datasetName + "table"
3161
3162        # Process columns according to manifest entries
3163        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3164        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3165        if specifySchema:
3166            if columnTypeDict == {}:
3167                logger.error("Did not provide a columnTypeDict.")
3168            # create list of columns:
3169            cols = []
3170
3171            for col in self.tableToLoad.columns:
3172                if col in table_schema_by_cname:
3173                    col_type = table_schema_by_cname[col]["columnType"]
3174                    max_size = (
3175                        table_schema_by_cname[col]["maximumSize"]
3176                        if "maximumSize" in table_schema_by_cname[col].keys()
3177                        else 100
3178                    )
3179                    max_list_len = 250
3180                    if max_size and max_list_len:
3181                        cols.append(
3182                            Column(
3183                                name=col,
3184                                columnType=col_type,
3185                                maximumSize=max_size,
3186                                maximumListLength=max_list_len,
3187                            )
3188                        )
3189                    elif max_size:
3190                        cols.append(
3191                            Column(name=col, columnType=col_type, maximumSize=max_size)
3192                        )
3193                    else:
3194                        cols.append(Column(name=col, columnType=col_type))
3195                else:
3196                    # TODO add warning that the given col was not found and it's max size is set to 100
3197                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3198
3199            # adds new columns to schema
3200            for col in cols:
3201                current_table.addColumn(col)
3202
3203            table_result = self.synStore.syn.store(
3204                current_table, isRestricted=self.restrict
3205            )
3206            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3207            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3208            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3209
3210            # wait for synapse store to finish
3211            sleep(1)
3212
3213            # build schema and table from columns and store with necessary restrictions
3214            schema = Schema(
3215                name=self.tableName, columns=cols, parent=datasetParentProject
3216            )
3217            schema.id = self.existingTableId
3218            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3219            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3220            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3221            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3222            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3223        else:
3224            logging.error("Must specify a schema for table replacements")
3225
3226        # remove system metadata from manifest
3227        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3228        return self.existingTableId
3229
3230    @tracer.start_as_current_span("TableOperations::_get_auth_token")
3231    def _get_auth_token(
3232        self,
3233    ):
3234        authtoken = None
3235
3236        # Get access token from environment variable if available
3237        # Primarily useful for testing environments, with other possible usefulness for containers
3238        env_access_token = os.getenv("SYNAPSE_ACCESS_TOKEN")
3239        if env_access_token:
3240            authtoken = env_access_token
3241            return authtoken
3242
3243        # Get token from authorization header
3244        # Primarily useful for API endpoint functionality
3245        if "Authorization" in self.synStore.syn.default_headers:
3246            authtoken = self.synStore.syn.default_headers["Authorization"].split(
3247                "Bearer "
3248            )[-1]
3249            return authtoken
3250
3251        # retrive credentials from synapse object
3252        # Primarily useful for local users, could only be stored here when a .synapseConfig file is used, but including to be safe
3253        synapse_object_creds = self.synStore.syn.credentials
3254        if hasattr(synapse_object_creds, "_token"):
3255            authtoken = synapse_object_creds.secret
3256
3257        # Try getting creds from .synapseConfig file if it exists
3258        # Primarily useful for local users. Seems to correlate with credentials stored in synaspe object when logged in
3259        if os.path.exists(CONFIG.synapse_configuration_path):
3260            config = get_config_file(CONFIG.synapse_configuration_path)
3261
3262            # check which credentials are provided in file
3263            if config.has_option("authentication", "authtoken"):
3264                authtoken = config.get("authentication", "authtoken")
3265
3266        # raise error if required credentials are not found
3267        if not authtoken:
3268            raise NameError(
3269                "authtoken credentials could not be found in the environment, synapse object, or the .synapseConfig file"
3270            )
3271
3272        return authtoken
3273
3274    @tracer.start_as_current_span("TableOperations::upsertTable")
3275    def upsertTable(self, dmge: DataModelGraphExplorer):
3276        """
3277        Method to upsert rows from a new manifest into an existing table on synapse
3278        For upsert functionality to work, primary keys must follow the naming convention of <componenet>_id
3279        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3280        Currently it is required to use -tcn "display label" with table upserts.
3281
3282
3283        Args:
3284            dmge: DataModelGraphExplorer instance
3285
3286        Returns:
3287           existingTableId: synID of the already existing table that had its metadata replaced
3288        """
3289
3290        authtoken = self._get_auth_token()
3291
3292        synapseDB = SynapseDatabase(
3293            auth_token=authtoken,
3294            project_id=self.synStore.getDatasetProject(self.datasetId),
3295            syn=self.synStore.syn,
3296            synapse_entity_tracker=self.synapse_entity_tracker,
3297        )
3298
3299        try:
3300            # Try performing upsert
3301            synapseDB.upsert_table_rows(
3302                table_name=self.tableName, data=self.tableToLoad
3303            )
3304        except SynapseHTTPError as ex:
3305            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3306            if "Id is not a valid column name or id" in str(ex):
3307                self._update_table_uuid_column(dmge)
3308                synapseDB.upsert_table_rows(
3309                    table_name=self.tableName, data=self.tableToLoad
3310                )
3311            # Raise if other error
3312            else:
3313                raise ex
3314
3315        return self.existingTableId
3316
3317    @tracer.start_as_current_span("TableOperations::_update_table_uuid_column")
3318    def _update_table_uuid_column(
3319        self,
3320        dmge: DataModelGraphExplorer,
3321    ) -> None:
3322        """Removes the `Uuid` column when present, and relpaces with an `Id` column
3323        Used to enable backwards compatability for manifests using the old `Uuid` convention
3324
3325        Args:
3326            dmge: DataModelGraphExplorer instance
3327
3328        Returns:
3329            None
3330        """
3331
3332        # Get the columns of the schema
3333        schema = self.synapse_entity_tracker.get(
3334            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3335        )
3336
3337        cols = self.synStore.syn.getTableColumns(schema)
3338
3339        # Iterate through columns until `Uuid` column is found
3340        for col in cols:
3341            if col.name.lower() == "uuid":
3342                # See if schema has `Uuid` column specified
3343                try:
3344                    uuid_col_in_schema = dmge.is_class_in_schema(col.name)
3345                except KeyError:
3346                    uuid_col_in_schema = False
3347
3348                # If there is, then create a new `Id` column from scratch
3349                if uuid_col_in_schema:
3350                    new_col = Column(columnType="STRING", maximumSize=64, name="Id")
3351                    schema.addColumn(new_col)
3352                    schema = self.synStore.syn.store(schema)
3353                    # self.synapse_entity_tracker.add(synapse_id=schema.id, entity=schema)
3354                    # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3355                    self.synapse_entity_tracker.remove(synapse_id=schema.id)
3356                # If there is not, then use the old `Uuid` column as a basis for the new `Id` column
3357                else:
3358                    # Build ColumnModel that will be used for new column
3359                    id_column = Column(
3360                        name="Id",
3361                        columnType="STRING",
3362                        maximumSize=64,
3363                        defaultValue=None,
3364                        maximumListLength=1,
3365                    )
3366                    new_col_response = self.synStore.syn.store(id_column)
3367
3368                    # Define columnChange body
3369                    columnChangeDict = {
3370                        "concreteType": "org.sagebionetworks.repo.model.table.TableSchemaChangeRequest",
3371                        "entityId": self.existingTableId,
3372                        "changes": [
3373                            {
3374                                "oldColumnId": col["id"],
3375                                "newColumnId": new_col_response["id"],
3376                            }
3377                        ],
3378                    }
3379
3380                    self.synStore.syn._async_table_update(
3381                        table=self.existingTableId,
3382                        changes=[columnChangeDict],
3383                        wait=False,
3384                    )
3385                break
3386
3387        return
3388
3389    @tracer.start_as_current_span("TableOperations::updateTable")
3390    def updateTable(
3391        self,
3392        update_col: str = "Id",
3393    ):
3394        """
3395        Method to update an existing table with a new column
3396
3397        Args:
3398            updateCol: column to index the old and new tables on
3399
3400        Returns:
3401           existingTableId: synID of the already existing table that had its metadata replaced
3402        """
3403        existing_table, existing_results = self.synStore.get_synapse_table(
3404            self.existingTableId
3405        )
3406
3407        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3408        # store table with existing etag data and impose restrictions as appropriate
3409        table_result = self.synStore.syn.store(
3410            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3411            isRestricted=self.restrict,
3412        )
3413        # We cannot store the Table to the `synapse_entity_tracker` because there is
3414        # not `Schema` on the table object. The above `.store()` function call would
3415        # also update the ETag of the entity within Synapse. Remove it from the tracker
3416        # and re-retrieve it later on if needed again.
3417        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3418
3419        return self.existingTableId

Object to hold functions for various table operations specific to the Synapse Asset Store.

Currently implement operations are: createTable: upload a manifest as a new table when none exist replaceTable: replace a metadata in a table from one manifest with metadata from another manifest updateTable: add a column to a table that already exists on synapse

Operations currently in development are: upsertTable: add metadata from a manifest to an existing table that contains metadata from another manifest

TableOperations( synStore: SynapseStorage, tableToLoad: pandas.core.frame.DataFrame = None, tableName: str = None, datasetId: str = None, existingTableId: str = None, restrict: bool = False, synapse_entity_tracker: schematic.store.synapse_tracker.SynapseEntityTracker = None) View Source

3012    def __init__(
3013        self,
3014        synStore: SynapseStorage,
3015        tableToLoad: pd.DataFrame = None,
3016        tableName: str = None,
3017        datasetId: str = None,
3018        existingTableId: str = None,
3019        restrict: bool = False,
3020        synapse_entity_tracker: SynapseEntityTracker = None,
3021    ):
3022        """
3023        Class governing table operations (creation, replacement, upserts, updates) in schematic
3024
3025        tableToLoad: manifest formatted appropriately for the table
3026        tableName: name of the table to be uploaded
3027        datasetId: synID of the dataset for the manifest
3028        existingTableId: synId of the table currently exising on synapse (if there is one)
3029        restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions
3030        synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities
3031
3032        """
3033        self.synStore = synStore
3034        self.tableToLoad = tableToLoad
3035        self.tableName = tableName
3036        self.datasetId = datasetId
3037        self.existingTableId = existingTableId
3038        self.restrict = restrict
3039        self.synapse_entity_tracker = synapse_entity_tracker or SynapseEntityTracker()

Class governing table operations (creation, replacement, upserts, updates) in schematic

tableToLoad: manifest formatted appropriately for the table tableName: name of the table to be uploaded datasetId: synID of the dataset for the manifest existingTableId: synId of the table currently exising on synapse (if there is one) restrict: bool, whether or not the manifest contains sensitive data that will need additional access restrictions synapse_entity_tracker: Tracker for a pull-through cache of Synapse entities

synStore

tableToLoad

tableName

datasetId

existingTableId

restrict

synapse_entity_tracker

@tracer.start_as_current_span('TableOperations::createTable')

def createTable(self, columnTypeDict: dict = None, specifySchema: bool = True): View Source

3041    @tracer.start_as_current_span("TableOperations::createTable")
3042    def createTable(
3043        self,
3044        columnTypeDict: dict = None,
3045        specifySchema: bool = True,
3046    ):
3047        """
3048        Method to create a table from a metadata manifest and upload it to synapse
3049
3050        Args:
3051            columnTypeDict: dictionary schema for table columns: type, size, etc
3052            specifySchema: to specify a specific schema for the table format
3053
3054        Returns:
3055            table.schema.id: synID of the newly created table
3056        """
3057        datasetEntity = self.synapse_entity_tracker.get(
3058            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3059        )
3060        datasetName = datasetEntity.name
3061        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3062
3063        if not self.tableName:
3064            self.tableName = datasetName + "table"
3065        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3066        if specifySchema:
3067            if columnTypeDict == {}:
3068                logger.error("Did not provide a columnTypeDict.")
3069            # create list of columns:
3070            cols = []
3071            for col in self.tableToLoad.columns:
3072                if col in table_schema_by_cname:
3073                    col_type = table_schema_by_cname[col]["columnType"]
3074                    max_size = (
3075                        table_schema_by_cname[col]["maximumSize"]
3076                        if "maximumSize" in table_schema_by_cname[col].keys()
3077                        else 100
3078                    )
3079                    max_list_len = 250
3080                    if max_size and max_list_len:
3081                        cols.append(
3082                            Column(
3083                                name=col,
3084                                columnType=col_type,
3085                                maximumSize=max_size,
3086                                maximumListLength=max_list_len,
3087                            )
3088                        )
3089                    elif max_size:
3090                        cols.append(
3091                            Column(name=col, columnType=col_type, maximumSize=max_size)
3092                        )
3093                    else:
3094                        cols.append(Column(name=col, columnType=col_type))
3095                else:
3096                    # TODO add warning that the given col was not found and it's max size is set to 100
3097                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3098            schema = Schema(
3099                name=self.tableName, columns=cols, parent=datasetParentProject
3100            )
3101            table = Table(schema, self.tableToLoad)
3102            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3103            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3104            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3105            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3106            return table.schema.id
3107        else:
3108            # For just uploading the tables to synapse using default
3109            # column types.
3110            table = build_table(self.tableName, datasetParentProject, self.tableToLoad)
3111            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3112            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3113            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3114            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3115            return table.schema.id

Method to create a table from a metadata manifest and upload it to synapse

Arguments:

columnTypeDict: dictionary schema for table columns: type, size, etc
specifySchema: to specify a specific schema for the table format

Returns:

table.schema.id: synID of the newly created table

@tracer.start_as_current_span('TableOperations::replaceTable')

def replaceTable(self, specifySchema: bool = True, columnTypeDict: dict = None): View Source

3117    @tracer.start_as_current_span("TableOperations::replaceTable")
3118    def replaceTable(
3119        self,
3120        specifySchema: bool = True,
3121        columnTypeDict: dict = None,
3122    ):
3123        """
3124        Method to replace an existing table on synapse with metadata from a new manifest
3125
3126        Args:
3127            specifySchema: to infer a schema for the table format
3128            columnTypeDict: dictionary schema for table columns: type, size, etc
3129
3130        Returns:
3131           existingTableId: synID of the already existing table that had its metadata replaced
3132        """
3133        datasetEntity = self.synapse_entity_tracker.get(
3134            synapse_id=self.datasetId, syn=self.synStore.syn, download_file=False
3135        )
3136
3137        datasetName = datasetEntity.name
3138        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3139        existing_table, existing_results = self.synStore.get_synapse_table(
3140            self.existingTableId
3141        )
3142        # remove rows
3143        self.synStore.syn.delete(existing_results)
3144        # Data changes such as removing all rows causes the eTag to change.
3145        self.synapse_entity_tracker.remove(synapse_id=self.existingTableId)
3146        # wait for row deletion to finish on synapse before getting empty table
3147        sleep(10)
3148
3149        # removes all current columns
3150        current_table = self.synapse_entity_tracker.get(
3151            synapse_id=self.existingTableId, syn=self.synStore.syn, download_file=False
3152        )
3153
3154        current_columns = self.synStore.syn.getTableColumns(current_table)
3155
3156        for col in current_columns:
3157            current_table.removeColumn(col)
3158
3159        if not self.tableName:
3160            self.tableName = datasetName + "table"
3161
3162        # Process columns according to manifest entries
3163        table_schema_by_cname = self.synStore._get_table_schema_by_cname(columnTypeDict)
3164        datasetParentProject = self.synStore.getDatasetProject(self.datasetId)
3165        if specifySchema:
3166            if columnTypeDict == {}:
3167                logger.error("Did not provide a columnTypeDict.")
3168            # create list of columns:
3169            cols = []
3170
3171            for col in self.tableToLoad.columns:
3172                if col in table_schema_by_cname:
3173                    col_type = table_schema_by_cname[col]["columnType"]
3174                    max_size = (
3175                        table_schema_by_cname[col]["maximumSize"]
3176                        if "maximumSize" in table_schema_by_cname[col].keys()
3177                        else 100
3178                    )
3179                    max_list_len = 250
3180                    if max_size and max_list_len:
3181                        cols.append(
3182                            Column(
3183                                name=col,
3184                                columnType=col_type,
3185                                maximumSize=max_size,
3186                                maximumListLength=max_list_len,
3187                            )
3188                        )
3189                    elif max_size:
3190                        cols.append(
3191                            Column(name=col, columnType=col_type, maximumSize=max_size)
3192                        )
3193                    else:
3194                        cols.append(Column(name=col, columnType=col_type))
3195                else:
3196                    # TODO add warning that the given col was not found and it's max size is set to 100
3197                    cols.append(Column(name=col, columnType="STRING", maximumSize=100))
3198
3199            # adds new columns to schema
3200            for col in cols:
3201                current_table.addColumn(col)
3202
3203            table_result = self.synStore.syn.store(
3204                current_table, isRestricted=self.restrict
3205            )
3206            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3207            # self.synapse_entity_tracker.add(synapse_id=table_result.schema.id, entity=table_result.schema)
3208            self.synapse_entity_tracker.remove(synapse_id=table_result.id)
3209
3210            # wait for synapse store to finish
3211            sleep(1)
3212
3213            # build schema and table from columns and store with necessary restrictions
3214            schema = Schema(
3215                name=self.tableName, columns=cols, parent=datasetParentProject
3216            )
3217            schema.id = self.existingTableId
3218            table = Table(schema, self.tableToLoad, etag=existing_results.etag)
3219            table = self.synStore.syn.store(table, isRestricted=self.restrict)
3220            # Commented out until https://sagebionetworks.jira.com/browse/PLFM-8605 is resolved
3221            # self.synapse_entity_tracker.add(synapse_id=table.schema.id, entity=table.schema)
3222            self.synapse_entity_tracker.remove(synapse_id=table.schema.id)
3223        else:
3224            logging.error("Must specify a schema for table replacements")
3225
3226        # remove system metadata from manifest
3227        existing_table.drop(columns=["ROW_ID", "ROW_VERSION"], inplace=True)
3228        return self.existingTableId

Method to replace an existing table on synapse with metadata from a new manifest

Arguments:

specifySchema: to infer a schema for the table format
columnTypeDict: dictionary schema for table columns: type, size, etc

Returns:

existingTableId: synID of the already existing table that had its metadata replaced

@tracer.start_as_current_span('TableOperations::upsertTable')

def upsertTable( self, dmge: schematic.schemas.data_model_graph.DataModelGraphExplorer): View Source

3274    @tracer.start_as_current_span("TableOperations::upsertTable")
3275    def upsertTable(self, dmge: DataModelGraphExplorer):
3276        """
3277        Method to upsert rows from a new manifest into an existing table on synapse
3278        For upsert functionality to work, primary keys must follow the naming convention of <componenet>_id
3279        `-tm upsert` should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality.
3280        Currently it is required to use -tcn "display label" with table upserts.
3281
3282
3283        Args:
3284            dmge: DataModelGraphExplorer instance
3285
3286        Returns:
3287           existingTableId: synID of the already existing table that had its metadata replaced
3288        """
3289
3290        authtoken = self._get_auth_token()
3291
3292        synapseDB = SynapseDatabase(
3293            auth_token=authtoken,
3294            project_id=self.synStore.getDatasetProject(self.datasetId),
3295            syn=self.synStore.syn,
3296            synapse_entity_tracker=self.synapse_entity_tracker,
3297        )
3298
3299        try:
3300            # Try performing upsert
3301            synapseDB.upsert_table_rows(
3302                table_name=self.tableName, data=self.tableToLoad
3303            )
3304        except SynapseHTTPError as ex:
3305            # If error is raised because Table has old `Uuid` column and not new `Id` column, then handle and re-attempt upload
3306            if "Id is not a valid column name or id" in str(ex):
3307                self._update_table_uuid_column(dmge)
3308                synapseDB.upsert_table_rows(
3309                    table_name=self.tableName, data=self.tableToLoad
3310                )
3311            # Raise if other error
3312            else:
3313                raise ex
3314
3315        return self.existingTableId

Method to upsert rows from a new manifest into an existing table on synapse For upsert functionality to work, primary keys must follow the naming convention of _id -tm upsert should be used for initial table uploads if users intend to upsert into them at a later time; using 'upsert' at creation will generate the metadata necessary for upsert functionality. Currently it is required to use -tcn "display label" with table upserts.

Arguments:

dmge: DataModelGraphExplorer instance

Returns:

existingTableId: synID of the already existing table that had its metadata replaced

@tracer.start_as_current_span('TableOperations::updateTable')

def updateTable(self, update_col: str = 'Id'): View Source

3389    @tracer.start_as_current_span("TableOperations::updateTable")
3390    def updateTable(
3391        self,
3392        update_col: str = "Id",
3393    ):
3394        """
3395        Method to update an existing table with a new column
3396
3397        Args:
3398            updateCol: column to index the old and new tables on
3399
3400        Returns:
3401           existingTableId: synID of the already existing table that had its metadata replaced
3402        """
3403        existing_table, existing_results = self.synStore.get_synapse_table(
3404            self.existingTableId
3405        )
3406
3407        self.tableToLoad = update_df(existing_table, self.tableToLoad, update_col)
3408        # store table with existing etag data and impose restrictions as appropriate
3409        table_result = self.synStore.syn.store(
3410            Table(self.existingTableId, self.tableToLoad, etag=existing_results.etag),
3411            isRestricted=self.restrict,
3412        )
3413        # We cannot store the Table to the `synapse_entity_tracker` because there is
3414        # not `Schema` on the table object. The above `.store()` function call would
3415        # also update the ETag of the entity within Synapse. Remove it from the tracker
3416        # and re-retrieve it later on if needed again.
3417        self.synapse_entity_tracker.remove(synapse_id=table_result.tableId)
3418
3419        return self.existingTableId

Method to update an existing table with a new column

Arguments:

updateCol: column to index the old and new tables on

Returns:

existingTableId: synID of the already existing table that had its metadata replaced

class DatasetFileView: View Source

3422class DatasetFileView:
3423    """Helper class to create temporary dataset file views.
3424    This class can be used in conjunction with a 'with' statement.
3425    This will ensure that the file view is deleted automatically.
3426    See SynapseStorage.getDatasetAnnotationsBatch for example usage.
3427    """
3428
3429    def __init__(
3430        self,
3431        datasetId: str,
3432        synapse: Synapse,
3433        name: str = None,
3434        temporary: bool = True,
3435        parentId: str = None,
3436    ) -> None:
3437        """Create a file view scoped to a dataset folder.
3438
3439        Args:
3440            datasetId (str): Synapse ID for a dataset folder/project.
3441            synapse (Synapse): Used for Synapse requests.
3442            name (str): Name of the file view (temporary or not).
3443            temporary (bool): Whether to delete the file view on exit
3444                of either a 'with' statement or Python entirely.
3445            parentId (str, optional): Synapse ID specifying where to
3446                store the file view. Defaults to datasetId.
3447        """
3448
3449        self.datasetId = datasetId
3450        self.synapse = synapse
3451        self.is_temporary = temporary
3452
3453        if name is None:
3454            self.name = f"schematic annotation file view for {self.datasetId}"
3455
3456        if self.is_temporary:
3457            uid = secrets.token_urlsafe(5)
3458            self.name = f"{self.name} - UID {uid}"
3459
3460        # TODO: Allow a DCC admin to configure a "universal parent"
3461        #       Such as a Synapse project writeable by everyone.
3462        self.parentId = datasetId if parentId is None else parentId
3463
3464        # TODO: Create local sharing setting to hide from everyone else
3465        view_schema = EntityViewSchema(
3466            name=self.name,
3467            parent=self.parentId,
3468            scopes=self.datasetId,
3469            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3470            addDefaultViewColumns=False,
3471            addAnnotationColumns=True,
3472        )
3473
3474        # TODO: Handle failure due to insufficient permissions by
3475        #       creating a temporary new project to store view
3476        self.view_schema = self.synapse.store(view_schema)
3477
3478        # These are filled in after calling `self.query()`
3479        self.results = None
3480        self.table = None
3481
3482        # Ensure deletion of the file view (last resort)
3483        if self.is_temporary:
3484            atexit.register(self.delete)
3485
3486    def __enter__(self):
3487        """Return file view when entering 'with' statement."""
3488        return self
3489
3490    def __exit__(self, exc_type, exc_value, traceback):
3491        """Delete file view when exiting 'with' statement."""
3492        if self.is_temporary:
3493            self.delete()
3494
3495    def delete(self):
3496        """Delete the file view on Synapse without deleting local table."""
3497        if self.view_schema is not None:
3498            self.synapse.delete(self.view_schema)
3499            self.view_schema = None
3500
3501    def query(self, tidy=True, force=False):
3502        """Retrieve file view as a data frame (raw format sans index)."""
3503        if self.table is None or force:
3504            fileview_id = self.view_schema["id"]
3505            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3506            self.table = self.results.asDataFrame(
3507                rowIdAndVersionInIndex=False,
3508                na_values=STR_NA_VALUES_FILTERED,
3509                keep_default_na=False,
3510            )
3511        if tidy:
3512            self.tidy_table()
3513        return self.table
3514
3515    def tidy_table(self):
3516        """Convert raw file view data frame into more usable format."""
3517        assert self.table is not None, "Must call `self.query()` first."
3518        self._fix_default_columns()
3519        self._fix_list_columns()
3520        self._fix_int_columns()
3521        return self.table
3522
3523    def _fix_default_columns(self):
3524        """Rename default columns to match schematic expectations."""
3525
3526        # Drop ROW_VERSION column if present
3527        if "ROW_VERSION" in self.table:
3528            del self.table["ROW_VERSION"]
3529
3530        # Rename id column to entityId and set as data frame index
3531        if "ROW_ID" in self.table:
3532            self.table["entityId"] = "syn" + self.table["ROW_ID"].astype(str)
3533            self.table = self.table.set_index("entityId", drop=False)
3534            del self.table["ROW_ID"]
3535
3536        # Rename ROW_ETAG column to eTag and place at end of data frame
3537        if "ROW_ETAG" in self.table:
3538            row_etags = self.table.pop("ROW_ETAG")
3539
3540            # eTag column may already present if users annotated data without submitting manifest
3541            # we're only concerned with the new values and not the existing ones
3542            if "eTag" in self.table:
3543                del self.table["eTag"]
3544
3545            self.table.insert(len(self.table.columns), "eTag", row_etags)
3546
3547        return self.table
3548
3549    def _get_columns_of_type(self, types):
3550        """Helper function to get list of columns of a given type(s)."""
3551        matching_columns = []
3552        for header in self.results.headers:
3553            if header.columnType in types:
3554                matching_columns.append(header.name)
3555        return matching_columns
3556
3557    def _fix_list_columns(self):
3558        """Fix formatting of list-columns."""
3559        list_types = {"STRING_LIST", "INTEGER_LIST", "BOOLEAN_LIST"}
3560        list_columns = self._get_columns_of_type(list_types)
3561        for col in list_columns:
3562            self.table[col] = self.table[col].apply(lambda x: ", ".join(x))
3563        return self.table
3564
3565    def _fix_int_columns(self):
3566        """Ensure that integer-columns are actually integers."""
3567        int_columns = self._get_columns_of_type({"INTEGER"})
3568        for col in int_columns:
3569            # Coercing to string because NaN is a floating point value
3570            # and cannot exist alongside integers in a column
3571            def to_int_fn(x):
3572                return "" if np.isnan(x) else str(int(x))
3573
3574            self.table[col] = self.table[col].apply(to_int_fn)
3575        return self.table

Helper class to create temporary dataset file views. This class can be used in conjunction with a 'with' statement. This will ensure that the file view is deleted automatically. See SynapseStorage.getDatasetAnnotationsBatch for example usage.

DatasetFileView( datasetId: str, synapse: synapseclient.client.Synapse, name: str = None, temporary: bool = True, parentId: str = None) View Source

3429    def __init__(
3430        self,
3431        datasetId: str,
3432        synapse: Synapse,
3433        name: str = None,
3434        temporary: bool = True,
3435        parentId: str = None,
3436    ) -> None:
3437        """Create a file view scoped to a dataset folder.
3438
3439        Args:
3440            datasetId (str): Synapse ID for a dataset folder/project.
3441            synapse (Synapse): Used for Synapse requests.
3442            name (str): Name of the file view (temporary or not).
3443            temporary (bool): Whether to delete the file view on exit
3444                of either a 'with' statement or Python entirely.
3445            parentId (str, optional): Synapse ID specifying where to
3446                store the file view. Defaults to datasetId.
3447        """
3448
3449        self.datasetId = datasetId
3450        self.synapse = synapse
3451        self.is_temporary = temporary
3452
3453        if name is None:
3454            self.name = f"schematic annotation file view for {self.datasetId}"
3455
3456        if self.is_temporary:
3457            uid = secrets.token_urlsafe(5)
3458            self.name = f"{self.name} - UID {uid}"
3459
3460        # TODO: Allow a DCC admin to configure a "universal parent"
3461        #       Such as a Synapse project writeable by everyone.
3462        self.parentId = datasetId if parentId is None else parentId
3463
3464        # TODO: Create local sharing setting to hide from everyone else
3465        view_schema = EntityViewSchema(
3466            name=self.name,
3467            parent=self.parentId,
3468            scopes=self.datasetId,
3469            includeEntityTypes=[EntityViewType.FILE, EntityViewType.FOLDER],
3470            addDefaultViewColumns=False,
3471            addAnnotationColumns=True,
3472        )
3473
3474        # TODO: Handle failure due to insufficient permissions by
3475        #       creating a temporary new project to store view
3476        self.view_schema = self.synapse.store(view_schema)
3477
3478        # These are filled in after calling `self.query()`
3479        self.results = None
3480        self.table = None
3481
3482        # Ensure deletion of the file view (last resort)
3483        if self.is_temporary:
3484            atexit.register(self.delete)

Create a file view scoped to a dataset folder.

Arguments:

datasetId (str): Synapse ID for a dataset folder/project.
synapse (Synapse): Used for Synapse requests.
name (str): Name of the file view (temporary or not).
temporary (bool): Whether to delete the file view on exit of either a 'with' statement or Python entirely.
parentId (str, optional): Synapse ID specifying where to store the file view. Defaults to datasetId.

datasetId

synapse

is_temporary

parentId

view_schema

results

table

def delete(self): View Source

3495    def delete(self):
3496        """Delete the file view on Synapse without deleting local table."""
3497        if self.view_schema is not None:
3498            self.synapse.delete(self.view_schema)
3499            self.view_schema = None

Delete the file view on Synapse without deleting local table.

def query(self, tidy=True, force=False): View Source

3501    def query(self, tidy=True, force=False):
3502        """Retrieve file view as a data frame (raw format sans index)."""
3503        if self.table is None or force:
3504            fileview_id = self.view_schema["id"]
3505            self.results = self.synapse.tableQuery(f"select * from {fileview_id}")
3506            self.table = self.results.asDataFrame(
3507                rowIdAndVersionInIndex=False,
3508                na_values=STR_NA_VALUES_FILTERED,
3509                keep_default_na=False,
3510            )
3511        if tidy:
3512            self.tidy_table()
3513        return self.table

Retrieve file view as a data frame (raw format sans index).

def tidy_table(self): View Source

3515    def tidy_table(self):
3516        """Convert raw file view data frame into more usable format."""
3517        assert self.table is not None, "Must call `self.query()` first."
3518        self._fix_default_columns()
3519        self._fix_list_columns()
3520        self._fix_int_columns()
3521        return self.table

Convert raw file view data frame into more usable format.