Skip to content

Maf

genie_registry.maf

Attributes

logger = logging.getLogger(__name__) module-attribute

Classes

FileTypeFormat

Source code in genie/example_filetype_format.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
class FileTypeFormat(metaclass=ABCMeta):
    """Abstract base class for validating and processing a center's files.

    Subclasses override the hook methods (``_validateFilename``,
    ``_get_dataframe``, ``_validate``, ``_cross_validate``,
    ``process_steps``) while the shared ``validate``/``process`` drivers
    defined here orchestrate them.
    """

    # Required keyword arguments for process(); subclasses may override.
    _process_kwargs = ["newPath", "databaseSynId"]

    # Identifier for this file type; returned by validateFilename().
    _fileType = "fileType"

    # Required keyword arguments for validate(); subclasses may override.
    _validation_kwargs: List[str] = []

    def __init__(
        self,
        syn: synapseclient.Synapse,
        center: str,
        genie_config: Optional[dict] = None,
        ancillary_files: Optional[List[List[synapseclient.Entity]]] = None,
    ):
        """A validator helper class for a center's files.

        Args:
            syn (synapseclient.Synapse): a synapseclient.Synapse object
            center (str): The participating center name.
            genie_config (dict): The configurations needed for the GENIE codebase.
                GENIE table type/name to Synapse Id. Defaults to None.
            ancillary_files (List[List[synapseclient.Entity]]): all files downloaded for validation. Defaults to None.
        """
        self.syn = syn
        self.center = center
        self.genie_config = genie_config
        # Used by validate() to decide whether cross-validation can run.
        self.ancillary_files = ancillary_files

        # self.pool = multiprocessing.Pool(poolSize)

    def _get_dataframe(self, filePathList):
        """
        This function by defaults assumes the filePathList is length of 1
        and is a tsv file.  Could change depending on file type.

        Args:
            filePathList:  A list of file paths (Max is 2 for the two
                           clinical files)

        Returns:
            df: Pandas dataframe of file
        """
        # Only the first path is read here; subclasses that accept multiple
        # files override this hook.
        filePath = filePathList[0]
        df = pd.read_csv(filePath, sep="\t", comment="#")
        return df

    def read_file(self, filePathList):
        """
        Each file is to be read in for validation and processing.
        This is not to be changed in any functions.

        Args:
            filePathList:  A list of file paths (Max is 2 for the two
                           clinical files)

        Returns:
            df: Pandas dataframe of file
        """
        # Delegates to the subclass-specific _get_dataframe hook.
        df = self._get_dataframe(filePathList)
        return df

    def _validateFilename(self, filePath):
        """
        Function that changes per file type for validating its filename
        Expects an assertion error.

        Args:
            filePath: Path to file
        """
        # assert True
        raise NotImplementedError

    def validateFilename(self, filePath):
        """
        Validation of file name.  The filename is what maps the file
        to its validation and processing.

        Args:
            filePath: Path to file

        Returns:
            str: file type defined by self._fileType
        """
        # _validateFilename raises (expected: AssertionError) on an
        # invalid name; reaching the return means the name is valid.
        self._validateFilename(filePath)
        return self._fileType

    def process_steps(self, df, **kwargs):
        """
        This function is modified for every single file.
        It reformats the file and stores the file into database and Synapse.
        """
        # Base implementation is a no-op and returns None.
        pass

    def preprocess(self, newpath):
        """
        This is for any preprocessing that has to occur to the entity name
        to add to kwargs for processing.  entity name is included in
        the new path

        Args:
            newpath: Path to file

        Returns:
            dict: extra keyword arguments merged into process() kwargs.
                Base implementation contributes none.
        """
        return dict()

    def process(self, filePath, **kwargs):
        """
        This is the main processing function.

        Args:
            filePath: Path to file
            kwargs: The kwargs are determined by self._process_kwargs

        Returns:
            str: file path of processed file
        """
        preprocess_args = self.preprocess(kwargs.get("newPath"))
        kwargs.update(preprocess_args)
        # Keep only the kwargs this file type declares; missing required
        # parameters fail fast with an AssertionError.
        mykwargs = {}
        for required_parameter in self._process_kwargs:
            assert required_parameter in kwargs.keys(), (
                "%s not in parameter list" % required_parameter
            )
            mykwargs[required_parameter] = kwargs[required_parameter]
        logger.info("PROCESSING %s" % filePath)
        # This is done because the clinical files are being merged into a list
        if self._fileType == "clinical":
            path_or_df = self.read_file(filePath)
        # vcf/maf/mafSP/md processing operates on the file path itself;
        # all other file types are first read into a dataframe.
        elif self._fileType not in ["vcf", "maf", "mafSP", "md"]:
            path_or_df = self.read_file([filePath])
        else:
            path_or_df = filePath
        path = self.process_steps(path_or_df, **mykwargs)
        return path

    def _validate(self, df: pd.DataFrame, **kwargs) -> tuple:
        """
        This is the base validation function.
        By default, no validation occurs.

        Args:
            df (pd.DataFrame): A dataframe of the file
            kwargs: The kwargs are determined by self._validation_kwargs

        Returns:
            tuple: The errors and warnings as strings from validation.
                   Defaults to blank strings
        """
        errors = ""
        warnings = ""
        logger.info("NO VALIDATION for %s files" % self._fileType)
        return errors, warnings

    def _cross_validate(self, df: pd.DataFrame) -> tuple:
        """
        This is the base cross-validation function.
        By default, no cross-validation occurs.

        Args:
            df (pd.DataFrame): A dataframe of the file

        Returns:
            tuple: The errors and warnings as strings from cross-validation.
                   Defaults to blank strings
        """
        errors = ""
        warnings = ""
        logger.info("NO CROSS-VALIDATION for %s files" % self._fileType)
        return errors, warnings

    def validate(self, filePathList, **kwargs) -> ValidationResults:
        """
        This is the main validation function.
        Every file type calls self._validate, which is different.

        Args:
            filePathList: A list of file paths.
            kwargs: The kwargs are determined by self._validation_kwargs

        Returns:
            ValidationResults: The errors and warnings from validation
                (and cross-validation when ancillary files are available).
        """
        mykwargs = {}
        for required_parameter in self._validation_kwargs:
            assert required_parameter in kwargs.keys(), (
                "%s not in parameter list" % required_parameter
            )
            mykwargs[required_parameter] = kwargs[required_parameter]

        errors = ""

        try:
            df = self.read_file(filePathList)
        except Exception as e:
            errors = (
                f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
            )
            warnings = ""

        if not errors:
            logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
            errors, warnings = self._validate(df, **mykwargs)
            # only cross-validate if validation passes and ancillary files exist
            # Assumes that self.ancillary_files won't be [[]] due to whats returned from
            # extract.get_center_input_files
            if not errors and (
                isinstance(self.ancillary_files, list) and self.ancillary_files
            ):
                logger.info(
                    "CROSS-VALIDATING %s" % os.path.basename(",".join(filePathList))
                )
                errors_cross_validate, warnings_cross_validate = self._cross_validate(
                    df
                )
                errors += errors_cross_validate
                warnings += warnings_cross_validate

        result_cls = ValidationResults(errors=errors, warnings=warnings)
        return result_cls
Functions
__init__(syn, center, genie_config=None, ancillary_files=None)

A validator helper class for a center's files.

PARAMETER DESCRIPTION
syn

a synapseclient.Synapse object

TYPE: Synapse

center

The participating center name.

TYPE: str

genie_config

The configurations needed for the GENIE codebase. GENIE table type/name to Synapse Id. Defaults to None.

TYPE: dict DEFAULT: None

ancillary_files

all files downloaded for validation. Defaults to None.

TYPE: List[List[Entity]] DEFAULT: None

Source code in genie/example_filetype_format.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(
    self,
    syn: synapseclient.Synapse,
    center: str,
    genie_config: Optional[dict] = None,
    ancillary_files: Optional[List[List[synapseclient.Entity]]] = None,
):
    """A validator helper class for a center's files.

    Args:
        syn (synapseclient.Synapse): a synapseclient.Synapse object
        center (str): The participating center name.
        genie_config (dict): The configurations needed for the GENIE codebase.
            GENIE table type/name to Synapse Id. Defaults to None.
        ancillary_files (List[List[synapseclient.Entity]]): all files downloaded for validation. Defaults to None.
    """
    self.syn = syn
    self.center = center
    self.genie_config = genie_config
    # Used by validate() to decide whether cross-validation can run.
    self.ancillary_files = ancillary_files
read_file(filePathList)

Each file is to be read in for validation and processing. This is not to be changed in any functions.

PARAMETER DESCRIPTION
filePathList

A list of file paths (Max is 2 for the two clinical files)

RETURNS DESCRIPTION
df

Pandas dataframe of file

Source code in genie/example_filetype_format.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def read_file(self, filePathList):
    """
    Each file is to be read in for validation and processing.
    This is not to be changed in any functions.

    Args:
        filePathList:  A list of file paths (Max is 2 for the two
                       clinical files)

    Returns:
        df: Pandas dataframe of file
    """
    # Delegates to the subclass-specific _get_dataframe hook.
    df = self._get_dataframe(filePathList)
    return df
validateFilename(filePath)

Validation of file name. The filename is what maps the file to its validation and processing.

PARAMETER DESCRIPTION
filePath

Path to file

RETURNS DESCRIPTION
str

file type defined by self._fileType

Source code in genie/example_filetype_format.py
125
126
127
128
129
130
131
132
133
134
135
136
137
def validateFilename(self, filePath):
    """
    Validation of file name.  The filename is what maps the file
    to its validation and processing.

    Args:
        filePath: Path to file

    Returns:
        str: file type defined by self._fileType
    """
    # _validateFilename raises (expected: AssertionError) on an invalid
    # name; reaching the return means the name is valid.
    self._validateFilename(filePath)
    return self._fileType
process_steps(df, **kwargs)

This function is modified for every single file. It reformats the file and stores the file into database and Synapse.

Source code in genie/example_filetype_format.py
139
140
141
142
143
144
def process_steps(self, df, **kwargs):
    """
    This function is modified for every single file.
    It reformats the file and stores the file into database and Synapse.
    """
    # Base implementation is a no-op and returns None.
    pass
preprocess(newpath)

This is for any preprocessing that has to occur to the entity name to add to kwargs for processing. entity name is included in the new path

PARAMETER DESCRIPTION
newpath

Path to file

Source code in genie/example_filetype_format.py
146
147
148
149
150
151
152
153
154
155
def preprocess(self, newpath):
    """
    This is for any preprocessing that has to occur to the entity name
    to add to kwargs for processing.  entity name is included in
    the new path

    Args:
        newpath: Path to file

    Returns:
        dict: extra keyword arguments merged into process() kwargs.
            Base implementation contributes none.
    """
    return dict()
process(filePath, **kwargs)

This is the main processing function.

PARAMETER DESCRIPTION
filePath

Path to file

kwargs

The kwargs are determined by self._process_kwargs

DEFAULT: {}

RETURNS DESCRIPTION
str

file path of processed file

Source code in genie/example_filetype_format.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def process(self, filePath, **kwargs):
    """
    This is the main processing function.

    Args:
        filePath: Path to file
        kwargs: The kwargs are determined by self._process_kwargs

    Returns:
        str: file path of processed file
    """
    preprocess_args = self.preprocess(kwargs.get("newPath"))
    kwargs.update(preprocess_args)
    # Keep only the kwargs this file type declares; missing required
    # parameters fail fast with an AssertionError.
    mykwargs = {}
    for required_parameter in self._process_kwargs:
        assert required_parameter in kwargs.keys(), (
            "%s not in parameter list" % required_parameter
        )
        mykwargs[required_parameter] = kwargs[required_parameter]
    logger.info("PROCESSING %s" % filePath)
    # This is done because the clinical files are being merged into a list
    if self._fileType == "clinical":
        path_or_df = self.read_file(filePath)
    # vcf/maf/mafSP/md processing operates on the file path itself;
    # all other file types are first read into a dataframe.
    elif self._fileType not in ["vcf", "maf", "mafSP", "md"]:
        path_or_df = self.read_file([filePath])
    else:
        path_or_df = filePath
    path = self.process_steps(path_or_df, **mykwargs)
    return path
validate(filePathList, **kwargs)

This is the main validation function. Every file type calls self._validate, which is different.

PARAMETER DESCRIPTION
filePathList

A list of file paths.

kwargs

The kwargs are determined by self._validation_kwargs

DEFAULT: {}

RETURNS DESCRIPTION
ValidationResults

The errors and warnings from validation (and cross-validation when ancillary files are available).

TYPE: ValidationResults

Source code in genie/example_filetype_format.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def validate(self, filePathList, **kwargs) -> ValidationResults:
    """
    This is the main validation function.
    Every file type calls self._validate, which is different.

    Args:
        filePathList: A list of file paths.
        kwargs: The kwargs are determined by self._validation_kwargs

    Returns:
        ValidationResults: The errors and warnings from validation
            (and cross-validation when ancillary files are available).
    """
    mykwargs = {}
    for required_parameter in self._validation_kwargs:
        assert required_parameter in kwargs.keys(), (
            "%s not in parameter list" % required_parameter
        )
        mykwargs[required_parameter] = kwargs[required_parameter]

    errors = ""

    try:
        df = self.read_file(filePathList)
    except Exception as e:
        errors = (
            f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
        )
        warnings = ""

    if not errors:
        logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
        errors, warnings = self._validate(df, **mykwargs)
        # only cross-validate if validation passes and ancillary files exist
        # Assumes that self.ancillary_files won't be [[]] due to whats returned from
        # extract.get_center_input_files
        if not errors and (
            isinstance(self.ancillary_files, list) and self.ancillary_files
        ):
            logger.info(
                "CROSS-VALIDATING %s" % os.path.basename(",".join(filePathList))
            )
            errors_cross_validate, warnings_cross_validate = self._cross_validate(
                df
            )
            errors += errors_cross_validate
            warnings += warnings_cross_validate

    result_cls = ValidationResults(errors=errors, warnings=warnings)
    return result_cls

maf

Bases: FileTypeFormat

MAF file format validation / processing

Source code in genie_registry/maf.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
class maf(FileTypeFormat):
    """
    MAF file format validation / processing
    """

    _fileType = "maf"

    _process_kwargs = []
    # Allele columns keep literal "NA"/"nan"/"NaN" strings when the file is
    # read, because those are valid allele values (see _get_dataframe).
    _allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
    # Characters that may appear (alone or combined) in an allele value.
    _allowed_comb_alleles = ["A", "T", "C", "G", "N"]
    # Characters only valid as a standalone allele value.
    _allowed_ind_alleles = ["-"]

    def _validateFilename(self, filePath):
        """
        Validates filename.  Should be
        data_mutations_extended_CENTER.txt
        """
        assert os.path.basename(filePath[0]) == "data_mutations_extended_{}.txt".format(
            self.center
        )

    def process_steps(self, df):
        """The processing of maf files is specific to GENIE, so
        not included in this function"""
        logger.info(
            "Please run with `--process mutation` parameter "
            "if you want to reannotate the mutation files"
        )
        return None

    def _validate(self, mutationDF):
        """
        This function validates the mutation file to make sure it
        adheres to the mutation SOP.

        t_depth: This column is conditionally optional.
        1. If this column is missing, the data must include the t_ref_count column. Otherwise, it will cause a validation error.
        2. If this column is present, it must have one of the following:
            - A mix of numeric values and NAs
            - All NAs
            - All numeric values

        There are no other checks on the actual values in this column.

        t_ref_count: This column is conditionally optional.
        1. If this column is missing, the data must include the t_depth column. Otherwise, it will cause a validation error.
        2. If this column is present, it must have one of the following:
            - A mix of numeric values and NAs
            - All NAs
            - All numeric values

        There are no other checks on the actual values in this column.

        t_alt_count: This column is entirely optional.
        1. If this column is present, it must have one of the following:
            - A mix of numeric values and NAs
            - All NAs
            - All numeric values

        There are no other checks on the actual values in this column.

        Args:
            mutationDF: mutation dataframe

        Returns:
            Text with all the errors in the mutation file
        """

        first_header = ["CHROMOSOME", "HUGO_SYMBOL", "TUMOR_SAMPLE_BARCODE"]
        # mafSP files have a slightly smaller set of required columns
        # (no T_ALT_COUNT) and skip the depth/optional-header checks below.
        SP = self._fileType == "mafSP"
        if SP:
            correct_column_headers = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "TUMOR_SEQ_ALLELE2",
            ]
            # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
        else:
            correct_column_headers = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "T_ALT_COUNT",
                "TUMOR_SEQ_ALLELE2",
            ]
            # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
        optional_headers = ["T_REF_COUNT", "N_DEPTH", "N_REF_COUNT", "N_ALT_COUNT"]

        # Column checks below are case-insensitive: normalize to upper case.
        mutationDF.columns = [col.upper() for col in mutationDF.columns]

        # Errors/warnings are accumulated in buffers and joined at the end.
        total_error = StringIO()
        warning = StringIO()

        # CHECK: Everything in correct_column_headers must be in mutation file
        if not all(
            [
                process_functions.checkColExist(mutationDF, i)
                for i in correct_column_headers
            ]
        ):
            total_error.write(
                "maf: Must at least have these headers: {}. "
                "If you are writing your maf file with R, please make "
                "sure to specify the 'quote=FALSE' parameter.\n".format(
                    ",".join(
                        [
                            i
                            for i in correct_column_headers
                            if i not in mutationDF.columns.values
                        ]
                    )
                )
            )
        else:
            # CHECK: First column must be in the first_header list
            if mutationDF.columns[0] not in first_header:
                total_error.write(
                    "maf: First column header must be "
                    "one of these: {}.\n".format(", ".join(first_header))
                )
            # No duplicated values across the variant primary key
            primary_cols = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "TUMOR_SEQ_ALLELE2",
            ]
            # Strip white space if string column
            for col in primary_cols:
                if mutationDF[col].dtype == object:
                    mutationDF[col] = mutationDF[col].str.strip()
            duplicated_idx = mutationDF.duplicated(primary_cols)

            if duplicated_idx.any():
                # Find samples with duplicated variants
                duplicated_variants = (
                    mutationDF["TUMOR_SAMPLE_BARCODE"][duplicated_idx]
                    .unique()
                    .astype(str)
                    .tolist()
                )
                total_error.write(
                    "maf: Must not have duplicated variants. "
                    "Samples with duplicated variants: "
                    f"{', '.join(duplicated_variants)}\n"
                )

        # CHECK: T_DEPTH and T_REF_COUNT are conditionally optional; at least
        # one must be present for non-SP maf files (see docstring).
        t_depth_exists = process_functions.checkColExist(mutationDF, "T_DEPTH")
        t_ref_exists = process_functions.checkColExist(mutationDF, "T_REF_COUNT")
        if not t_depth_exists and not t_ref_exists and not SP:
            total_error.write("maf: If missing T_DEPTH, must have T_REF_COUNT!\n")
        numerical_cols = [
            "T_DEPTH",
            "T_ALT_COUNT",
            "T_REF_COUNT",
            "N_DEPTH",
            "N_REF_COUNT",
            "N_ALT_COUNT",
            "START_POSITION",
            "END_POSITION",
        ]
        actual_numerical_cols = []
        for col in numerical_cols:
            col_exists = process_functions.checkColExist(mutationDF, col)
            if col_exists:
                # Attempt to convert column to float; a ValueError means the
                # column holds non-numeric strings and is reported below.
                try:
                    mutationDF[col] = mutationDF[col].astype(float)
                except ValueError:
                    pass
                if mutationDF[col].dtype not in [int, float]:
                    total_error.write(f"maf: {col} must be a numerical column.\n")
                else:
                    actual_numerical_cols.append(col)

        # CHECK: Must have TUMOR_SEQ_ALLELE2
        error, warn = _check_allele_col(mutationDF, "TUMOR_SEQ_ALLELE2")
        total_error.write(error)
        warning.write(warn)

        # CHECK: Mutation file would benefit from columns in optional_headers
        if (
            not all(
                [
                    process_functions.checkColExist(mutationDF, i)
                    for i in optional_headers
                ]
            )
            and not SP
        ):
            warning.write(
                "maf: Does not have the column headers that can give extra "
                "information to the processed maf: {}.\n".format(
                    ", ".join(
                        [
                            i
                            for i in optional_headers
                            if i not in mutationDF.columns.values
                        ]
                    )
                )
            )

        # CHECK: Must have REFERENCE_ALLELE
        error, warn = _check_allele_col(mutationDF, "REFERENCE_ALLELE")
        total_error.write(error)
        warning.write(warn)

        # CHECK: chromosome values (no 'chr' prefix allowed for maf)
        error, warn = validate._validate_chromosome(
            df=mutationDF, col="CHROMOSOME", fileformat="maf", allow_chr=False
        )
        total_error.write(error)
        warning.write(warn)

        # CHECK: TSA1/TSA2/REF combination rules (see _check_allele_col_validity)
        error = _check_allele_col_validity(mutationDF)
        total_error.write(error)

        if process_functions.checkColExist(mutationDF, "TUMOR_SAMPLE_BARCODE"):
            error = process_functions.validate_genie_identifier(
                identifiers=mutationDF["TUMOR_SAMPLE_BARCODE"],
                center=self.center,
                filename="maf",
                col="TUMOR_SAMPLE_BARCODE",
            )
            total_error.write(error)

        # only check end position as start position is required col; both
        # must also have passed the numeric-column check above.
        if (
            process_functions.checkColExist(mutationDF, "START_POSITION")
            and process_functions.checkColExist(mutationDF, "END_POSITION")
            and set(["START_POSITION", "END_POSITION"]) <= set(actual_numerical_cols)
        ):
            errors, warnings = validate.check_variant_start_and_end_positions(
                input_df=mutationDF,
                start_pos_col="START_POSITION",
                end_pos_col="END_POSITION",
                filename="maf",
            )
            total_error.write(errors)
            warning.write(warnings)

        # CHECK: allele columns may only contain the allowed characters
        for allele_col in self._allele_cols:
            if process_functions.checkColExist(mutationDF, allele_col):
                invalid_indices = validate.get_invalid_allele_rows(
                    mutationDF,
                    allele_col,
                    allowed_comb_alleles=self._allowed_comb_alleles,
                    allowed_ind_alleles=self._allowed_ind_alleles,
                    ignore_case=True,
                    allow_na=False,
                )
                errors, warnings = validate.get_allele_validation_message(
                    invalid_indices,
                    invalid_col=allele_col,
                    allowed_comb_alleles=self._allowed_comb_alleles,
                    allowed_ind_alleles=self._allowed_ind_alleles,
                    fileformat=self._fileType,
                )
                total_error.write(errors)
                warning.write(warnings)

        return total_error.getvalue(), warning.getvalue()

    def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
        """This function cross-validates the mutation file to make sure it
        adheres to the mutation SOP.

        Args:
            mutationDF (pd.DataFrame): mutation dataframe

        Returns:
            Text with all the errors in the mutation file
        """
        errors = ""
        warnings = ""

        # This section can be removed once we remove the list of lists
        clinical_files = validate.parse_file_info_in_nested_list(
            nested_list=self.ancillary_files, search_str="data_clinical_supp"  # type: ignore[arg-type]
        )
        clinical_file_paths = clinical_files["file_info"]["path"]

        if clinical_files["files"]:
            # Cross-validation is best-effort: an unreadable clinical file is
            # not this file's error, so it is skipped rather than reported.
            try:
                clinical_sample_df = process_functions.get_clinical_dataframe(
                    filePathList=clinical_file_paths
                )
                has_file_read_error = False
            except Exception:
                has_file_read_error = True

            if not has_file_read_error:
                if process_functions.checkColExist(clinical_sample_df, "SAMPLE_ID"):
                    errors, warnings = validate.check_values_between_two_df(
                        df1=mutationDF,
                        df1_filename="MAF",
                        df1_id_to_check="TUMOR_SAMPLE_BARCODE",
                        df2=clinical_sample_df,
                        df2_filename="sample clinical",
                        df2_id_to_check="SAMPLE_ID",
                    )
        return errors, warnings

    def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
        """Get mutation dataframe

        1) Starts reading the first line in the file
        2) Skips lines that starts with #
        3) Reads in second line
        4) Checks that first line fields matches second line. Must do this because
        pandas.read_csv will allow for a file to have more column headers than content.
        E.g)  A,B,C,D,E
              1,2
              2,3

        5) We keep the 'NA', 'nan', and 'NaN' as strings in the data because
        these are valid allele values
        then convert the ones in the non-allele columns back to actual NAs

        NOTE: Because allele columns are case-insensitive in maf data, we must
        standardize the case of the columns when checking for the non-allele columns
        to convert the NA strings to NAs

        NOTE: This code allows empty dataframes to pass through
        without errors

        Args:
            filePathList (List[str]): list of filepath(s)

        Raises:
            ValueError: First line fields doesn't match second line fields in file

        Returns:
            pd.DataFrame: mutation data

        """
        with open(filePathList[0], "r") as maf_f:
            firstline = maf_f.readline()
            # A leading '#' line is a comment; the header is the next line.
            if firstline.startswith("#"):
                firstline = maf_f.readline()
            secondline = maf_f.readline()

        if len(firstline.split("\t")) != len(secondline.split("\t")):
            raise ValueError(
                "Number of fields in a line do not match the "
                "expected number of columns"
            )

        read_csv_params = {
            "filepath_or_buffer": filePathList[0],
            "sep": "\t",
            "comment": "#",
            # Disable pandas' default NA strings so "NA"/"nan"/"NaN" survive
            # as text; they are valid allele values (restored below for
            # non-allele columns only).
            "keep_default_na": False,
            "na_values": [
                "-1.#IND",
                "1.#QNAN",
                "1.#IND",
                "-1.#QNAN",
                "#N/A N/A",
                "#N/A",
                "N/A",
                "#NA",
                "NULL",
                "-NaN",
                "-nan",
                "",
            ],
            # This is to check if people write files
            # with R, quote=T
            "quoting": 3,
            # Retain completely blank lines so that
            # validator will cause the file to fail
            "skip_blank_lines": False,
        }

        mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params)

        mutationdf = transform._convert_values_to_na(
            input_df=mutationdf,
            values_to_replace=["NA", "nan", "NaN"],
            columns_to_convert=[
                col
                for col in mutationdf.columns
                if col.upper() not in self._allele_cols
            ],
        )
        return mutationdf
Functions
process_steps(df)

The processing of maf files is specific to GENIE, so not included in this function

Source code in genie_registry/maf.py
170
171
172
173
174
175
176
177
def process_steps(self, df):
    """Skip GENIE-specific maf processing.

    Reannotation of mutation files is handled by a separate pipeline
    step, so this method only logs a pointer to it and performs no
    transformation of ``df``.

    Args:
        df: mutation dataframe (unused)

    Returns:
        None
    """
    reannotation_hint = (
        "Please run with `--process mutation` parameter "
        "if you want to reannotate the mutation files"
    )
    logger.info(reannotation_hint)
    return None

Functions

_check_allele_col_validity(df)

This function checks specific columns in a MAF (Mutation Annotation Format) file for certain conditions.

The following conditions must be met

If the MAF file has all three of these columns

- TUMOR_SEQ_ALLELE1 (TSA1)
- TUMOR_SEQ_ALLELE2 (TSA2)
- REFERENCE_ALLELE (REF)

Then, one of the following must be true

- Every value in TSA1 must be the same as the value in REF
- Every value in TSA1 must be the same as the value in TSA2

Additionally, if the MAF file has at least these two columns

- REFERENCE_ALLELE (REF)
- TUMOR_SEQ_ALLELE2 (TSA2)

Then

NO values in REF can match TSA2

These rules are important because Genome Nexus (GN) uses TSA1 to annotate data when it's not clear which variant to use. So, there can't be a mix of rows where some have TSA1 equal to REF and some have TSA1 equal to TSA2.

Valid Examples
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| C                | C                 | A                 |
| T                | T                 | C                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| C                | A                 | A                 |
| T                | C                 | C                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- |
| C                | A                 |
| T                | C                 |
Invalid Examples
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| C                | C                 | A                 |
| C                | A                 | A                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| A                | C                 | A                 |
| T                | C                 | T                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- |
| C                | C                 |
| T                | C                 |

See this Genome Nexus issue for more background regarding why this validation rule was implemented.

PARAMETER DESCRIPTION
df

input mutation dataframe

TYPE: DataFrame

RETURNS DESCRIPTION
str

the error message

TYPE: str

Source code in genie_registry/maf.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def _check_allele_col_validity(df: pd.DataFrame) -> str:
    """
    This function checks specific columns in a MAF (Mutation Annotation Format)
    file for certain conditions.

    The following conditions must be met:
        **If the MAF file has all three of these columns**

            - TUMOR_SEQ_ALLELE1 (TSA1)
            - TUMOR_SEQ_ALLELE2 (TSA2)
            - REFERENCE_ALLELE (REF)

        **Then, one of the following must be true**

            - Every value in TSA1 must be the same as the value in REF
            - Every value in TSA1 must be the same as the value in TSA2

        **Additionally, if the MAF file has at least these two columns**

            - REFERENCE_ALLELE (REF)
            - TUMOR_SEQ_ALLELE2 (TSA2)

        **Then**

            NO values in REF can match TSA2

        These rules are important because Genome Nexus (GN) uses `TSA1` to annotate data
        when it's not clear which variant to use. So, there can't be a mix of rows where
        some have `TSA1` equal to `REF` and some have `TSA1` equal to `TSA2`.

    Example: Valid Examples
        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | C                | C                 | A                 |
        | T                | T                 | C                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | C                | A                 | A                 |
        | T                | C                 | C                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- |
        | C                | A                 |
        | T                | C                 |
        ```


    Example: Invalid Examples
        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | C                | C                 | A                 |
        | C                | A                 | A                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | A                | C                 | A                 |
        | T                | C                 | T                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- |
        | C                | C                 |
        | T                | C                 |
        ```


    See this [Genome Nexus issue](https://github.com/genome-nexus/annotation-tools/issues/26) for
    more background regarding why this validation rule was implemented.

    Args:
        df: input mutation dataframe

    Returns:
        str: the error message (empty string when all checks pass)
    """
    tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
    tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
    ref_col_exist = process_functions.checkColExist(df, "REFERENCE_ALLELE")
    error = ""
    if tsa2_col_exist and tsa1_col_exist and ref_col_exist:
        # TSA1 must uniformly equal either REF or TSA2 across the whole
        # file; a mix of the two conventions is ambiguous for annotation.
        tsa1_eq_ref = all(df["TUMOR_SEQ_ALLELE1"] == df["REFERENCE_ALLELE"])
        tsa1_eq_tsa2 = all(df["TUMOR_SEQ_ALLELE1"] == df["TUMOR_SEQ_ALLELE2"])
        if not (tsa1_eq_ref or tsa1_eq_tsa2):
            error = (
                "maf: Contains both "
                "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
                "All values in TUMOR_SEQ_ALLELE1 must match all values in "
                "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
            )
    if tsa2_col_exist and ref_col_exist:
        # Evaluate the query once (the original ran it twice and stored
        # the row indices in an unused local).
        ref_matches_tsa2 = df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2")
        if not ref_matches_tsa2.empty:
            error = (
                f"{error}maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
                "This is invalid. Please correct.\n"
            )
    return error

_check_allele_col(df, col)

Check the Allele column is correctly formatted.

PARAMETER DESCRIPTION
df

mutation dataframe

col

Column header name

RETURNS DESCRIPTION

error, warning

Source code in genie_registry/maf.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def _check_allele_col(df, col):
    """
    Check the Allele column is correctly formatted.

    Args:
        df: mutation dataframe
        col: Column header name

    Returns:
        error, warning: error message string (empty when valid) and a
            warning string (currently always empty)

    """
    col_exist = process_functions.checkColExist(df, col)
    error = ""
    warning = ""
    if col_exist:
        # CHECK: There can't be any null values.
        # .any() short-circuits at the pandas level instead of summing
        # the whole boolean mask in Python.
        if df[col].isnull().any():
            error = f"maf: {col} can't have any blank or null values.\n"

    return error, warning