Skip to content

Vcf

genie_registry.vcf

Attributes

logger = logging.getLogger(__name__) module-attribute

Classes

FileTypeFormat

Source code in genie/example_filetype_format.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
class FileTypeFormat(metaclass=ABCMeta):
    """Abstract base class describing how one GENIE file type is read,
    validated and processed.

    Subclasses override the hook methods (``_validateFilename``,
    ``_get_dataframe``, ``_validate``, ``_cross_validate``,
    ``process_steps``, ``preprocess``) and the class attributes below;
    the public ``validateFilename``, ``validate`` and ``process``
    methods drive the shared workflow.
    """

    # Keyword arguments required by process(); subclasses override.
    _process_kwargs = ["newPath", "databaseSynId"]

    # Identifier for the file type (e.g. "vcf", "maf"); subclasses override.
    _fileType = "fileType"

    # Keyword arguments required by validate(); subclasses override.
    _validation_kwargs: List[str] = []

    def __init__(
        self,
        syn: synapseclient.Synapse,
        center: str,
        genie_config: Optional[dict] = None,
        ancillary_files: Optional[List[List[synapseclient.Entity]]] = None,
    ):
        """A validator helper class for a center's files.

        Args:
            syn (synapseclient.Synapse): a synapseclient.Synapse object
            center (str): The participating center name.
            genie_config (dict): The configurations needed for the GENIE codebase.
                GENIE table type/name to Synapse Id. Defaults to None.
            ancillary_files (List[List[synapseclient.Entity]]): all files
                downloaded for validation. Defaults to None.
        """
        self.syn = syn
        self.center = center
        self.genie_config = genie_config
        self.ancillary_files = ancillary_files

        # self.pool = multiprocessing.Pool(poolSize)

    def _get_dataframe(self, filePathList):
        """
        Hook: read the file(s) into a dataframe.  By default assumes the
        filePathList is length of 1 and is a tsv file.  Subclasses override
        this depending on file type.

        Args:
            filePathList:  A list of file paths (Max is 2 for the two
                           clinical files)

        Returns:
            df: Pandas dataframe of file
        """
        filePath = filePathList[0]
        df = pd.read_csv(filePath, sep="\t", comment="#")
        return df

    def read_file(self, filePathList):
        """
        Each file is to be read in for validation and processing.
        This is not to be changed in any functions.

        Args:
            filePathList:  A list of file paths (Max is 2 for the two
                           clinical files)

        Returns:
            df: Pandas dataframe of file
        """
        df = self._get_dataframe(filePathList)
        return df

    def _validateFilename(self, filePath):
        """
        Hook: validate the filename; implementation changes per file type.
        Implementations signal an invalid name by raising AssertionError.

        Args:
            filePath: Path to file

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError

    def validateFilename(self, filePath):
        """
        Validation of file name.  The filename is what maps the file
        to its validation and processing.

        Args:
            filePath: Path to file

        Returns:
            str: file type defined by self._fileType
        """
        self._validateFilename(filePath)
        return self._fileType

    def process_steps(self, df, **kwargs):
        """
        Hook: reformat the file and store it into the database and Synapse.
        This function is modified for every single file type; the base
        implementation does nothing.
        """
        pass

    def preprocess(self, newpath):
        """
        This is for any preprocessing that has to occur to the entity name
        to add to kwargs for processing.  entity name is included in
        the new path

        Args:
            newpath: Path to file

        Returns:
            dict: extra keyword arguments to merge into process() kwargs;
                empty by default.
        """
        return dict()

    def process(self, filePath, **kwargs):
        """
        This is the main processing function.

        Args:
            filePath: Path to file
            kwargs: The kwargs are determined by self._process_kwargs

        Returns:
            str: file path of processed file
        """
        preprocess_args = self.preprocess(kwargs.get("newPath"))
        kwargs.update(preprocess_args)
        # Keep only the kwargs this file type declares; fail loudly when a
        # required parameter is missing.
        mykwargs = {}
        for required_parameter in self._process_kwargs:
            assert required_parameter in kwargs.keys(), (
                "%s not in parameter list" % required_parameter
            )
            mykwargs[required_parameter] = kwargs[required_parameter]
        logger.info("PROCESSING %s" % filePath)
        # This is done because the clinical files are being merged into a list
        if self._fileType == "clinical":
            path_or_df = self.read_file(filePath)
        # vcf/maf/mafSP/md processing requires a file path, so only the
        # remaining file types are read into a dataframe here.
        elif self._fileType not in ["vcf", "maf", "mafSP", "md"]:
            path_or_df = self.read_file([filePath])
        else:
            path_or_df = filePath
        path = self.process_steps(path_or_df, **mykwargs)
        return path

    def _validate(self, df: pd.DataFrame, **kwargs) -> tuple:
        """
        Hook: base validation function; subclasses override.
        By default, no validation occurs.

        Args:
            df (pd.DataFrame): A dataframe of the file
            kwargs: The kwargs are determined by self._validation_kwargs

        Returns:
            tuple: The errors and warnings as strings from validation.
                   Defaults to blank strings
        """
        errors = ""
        warnings = ""
        logger.info("NO VALIDATION for %s files" % self._fileType)
        return errors, warnings

    def _cross_validate(self, df: pd.DataFrame) -> tuple:
        """
        Hook: base cross-validation function (checks against ancillary
        files); subclasses override.  By default, no cross-validation occurs.

        Args:
            df (pd.DataFrame): A dataframe of the file

        Returns:
            tuple: The errors and warnings as strings from cross-validation.
                   Defaults to blank strings
        """
        errors = ""
        warnings = ""
        logger.info("NO CROSS-VALIDATION for %s files" % self._fileType)
        return errors, warnings

    def validate(self, filePathList, **kwargs) -> ValidationResults:
        """
        This is the main validation function.
        Every file type calls self._validate, which is different.

        Args:
            filePathList: A list of file paths.
            kwargs: The kwargs are determined by self._validation_kwargs

        Returns:
            ValidationResults: The aggregated errors and warnings from
                validation (and cross-validation, when it runs).
        """
        mykwargs = {}
        for required_parameter in self._validation_kwargs:
            assert required_parameter in kwargs.keys(), (
                "%s not in parameter list" % required_parameter
            )
            mykwargs[required_parameter] = kwargs[required_parameter]

        errors = ""

        try:
            df = self.read_file(filePathList)
        except Exception as e:
            errors = (
                f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
            )
            warnings = ""

        if not errors:
            logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
            errors, warnings = self._validate(df, **mykwargs)
            # Only cross-validate when validation passes AND ancillary files
            # exist (both conditions are required below).
            # Assumes that self.ancillary_files won't be [[]] due to what's
            # returned from extract.get_center_input_files
            if not errors and (
                isinstance(self.ancillary_files, list) and self.ancillary_files
            ):
                logger.info(
                    "CROSS-VALIDATING %s" % os.path.basename(",".join(filePathList))
                )
                errors_cross_validate, warnings_cross_validate = self._cross_validate(
                    df
                )
                errors += errors_cross_validate
                warnings += warnings_cross_validate

        result_cls = ValidationResults(errors=errors, warnings=warnings)
        return result_cls
Functions
__init__(syn, center, genie_config=None, ancillary_files=None)

A validator helper class for a center's files.

PARAMETER DESCRIPTION
syn

a synapseclient.Synapse object

TYPE: Synapse

center

The participating center name.

TYPE: str

genie_config

The configurations needed for the GENIE codebase. GENIE table type/name to Synapse Id. Defaults to None.

TYPE: dict DEFAULT: None

ancillary_files

all files downloaded for validation. Defaults to None.

TYPE: List[List[Entity]] DEFAULT: None

Source code in genie/example_filetype_format.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(
    self,
    syn: synapseclient.Synapse,
    center: str,
    genie_config: Optional[dict] = None,
    ancillary_files: Optional[List[List[synapseclient.Entity]]] = None,
):
    """A validator helper class for a center's files.

    Args:
        syn (synapseclient.Synapse): a synapseclient.Synapse object
        center (str): The participating center name.
        genie_config (dict): The configurations needed for the GENIE codebase.
            GENIE table type/name to Synapse Id. Defaults to None.
        ancillary_files (List[List[synapseclient.Entity]]): all files downloaded for validation. Defaults to None.
    """
    self.syn = syn
    self.center = center
    self.genie_config = genie_config
    self.ancillary_files = ancillary_files
read_file(filePathList)

Each file is to be read in for validation and processing. This is not to be changed in any functions.

PARAMETER DESCRIPTION
filePathList

A list of file paths (Max is 2 for the two clinical files)

RETURNS DESCRIPTION
df

Pandas dataframe of file

Source code in genie/example_filetype_format.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def read_file(self, filePathList):
    """
    Each file is to be read in for validation and processing.
    This is not to be changed in any functions.

    Args:
        filePathList:  A list of file paths (Max is 2 for the two
                       clinical files)

    Returns:
        df: Pandas dataframe of file
    """
    df = self._get_dataframe(filePathList)
    return df
validateFilename(filePath)

Validation of file name. The filename is what maps the file to its validation and processing.

PARAMETER DESCRIPTION
filePath

Path to file

RETURNS DESCRIPTION
str

file type defined by self._fileType

Source code in genie/example_filetype_format.py
125
126
127
128
129
130
131
132
133
134
135
136
137
def validateFilename(self, filePath):
    """
    Validation of file name.  The filename is what maps the file
    to its validation and processing.

    Args:
        filePath: Path to file

    Returns:
        str: file type defined by self._fileType
    """
    self._validateFilename(filePath)
    return self._fileType
process_steps(df, **kwargs)

This function is modified for every single file. It reformats the file and stores the file into database and Synapse.

Source code in genie/example_filetype_format.py
139
140
141
142
143
144
def process_steps(self, df, **kwargs):
    """
    This function is modified for every single file.
    It reformats the file and stores the file into database and Synapse.
    """
    pass
preprocess(newpath)

This is for any preprocessing that has to occur to the entity name to add to kwargs for processing. entity name is included in the new path

PARAMETER DESCRIPTION
newpath

Path to file

Source code in genie/example_filetype_format.py
146
147
148
149
150
151
152
153
154
155
def preprocess(self, newpath):
    """
    This is for any preprocessing that has to occur to the entity name
    to add to kwargs for processing.  entity name is included in
    the new path

    Args:
        newpath: Path to file
    """
    return dict()
process(filePath, **kwargs)

This is the main processing function.

PARAMETER DESCRIPTION
filePath

Path to file

kwargs

The kwargs are determined by self._process_kwargs

DEFAULT: {}

RETURNS DESCRIPTION
str

file path of processed file

Source code in genie/example_filetype_format.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def process(self, filePath, **kwargs):
    """
    This is the main processing function.

    Args:
        filePath: Path to file
        kwargs: The kwargs are determined by self._process_kwargs

    Returns:
        str: file path of processed file
    """
    preprocess_args = self.preprocess(kwargs.get("newPath"))
    kwargs.update(preprocess_args)
    mykwargs = {}
    for required_parameter in self._process_kwargs:
        assert required_parameter in kwargs.keys(), (
            "%s not in parameter list" % required_parameter
        )
        mykwargs[required_parameter] = kwargs[required_parameter]
    logger.info("PROCESSING %s" % filePath)
    # This is done because the clinical files are being merged into a list
    if self._fileType == "clinical":
        path_or_df = self.read_file(filePath)
    # If file type is vcf or maf file, processing requires a filepath
    elif self._fileType not in ["vcf", "maf", "mafSP", "md"]:
        path_or_df = self.read_file([filePath])
    else:
        path_or_df = filePath
    path = self.process_steps(path_or_df, **mykwargs)
    return path
validate(filePathList, **kwargs)

This is the main validation function. Every file type calls self._validate, which is different.

PARAMETER DESCRIPTION
filePathList

A list of file paths.

kwargs

The kwargs are determined by self._validation_kwargs

DEFAULT: {}

RETURNS DESCRIPTION
ValidationResults

The errors and warnings (as strings) collected from validation and, when ancillary files exist, cross-validation.

TYPE: ValidationResults

Source code in genie/example_filetype_format.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def validate(self, filePathList, **kwargs) -> ValidationResults:
    """
    This is the main validation function.
    Every file type calls self._validate, which is different.

    Args:
        filePathList: A list of file paths.
        kwargs: The kwargs are determined by self._validation_kwargs

    Returns:
        tuple: The errors and warnings as a file from validation.
    """
    mykwargs = {}
    for required_parameter in self._validation_kwargs:
        assert required_parameter in kwargs.keys(), (
            "%s not in parameter list" % required_parameter
        )
        mykwargs[required_parameter] = kwargs[required_parameter]

    errors = ""

    try:
        df = self.read_file(filePathList)
    except Exception as e:
        errors = (
            f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
        )
        warnings = ""

    if not errors:
        logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
        errors, warnings = self._validate(df, **mykwargs)
        # only cross-validate if validation passes or ancillary files exist
        # Assumes that self.ancillary_files won't be [[]] due to whats returned from
        # extract.get_center_input_files
        if not errors and (
            isinstance(self.ancillary_files, list) and self.ancillary_files
        ):
            logger.info(
                "CROSS-VALIDATING %s" % os.path.basename(",".join(filePathList))
            )
            errors_cross_validate, warnings_cross_validate = self._cross_validate(
                df
            )
            errors += errors_cross_validate
            warnings += warnings_cross_validate

    result_cls = ValidationResults(errors=errors, warnings=warnings)
    return result_cls

vcf

Bases: FileTypeFormat

Source code in genie_registry/vcf.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
class vcf(FileTypeFormat):
    """FileTypeFormat subclass for GENIE vcf files.

    Handles filename and content validation of submitted VCFs; actual
    reannotation/processing is handled by the separate mutation workflow
    (see process_steps below).
    """

    _fileType = "vcf"

    # process() needs no extra keyword arguments for vcf files.
    _process_kwargs = []
    # Columns holding allele strings, where "NA"/"nan"/"NaN" are real values.
    _allele_cols = ["REF"]
    # Characters an allele string may be composed of.
    _allowed_comb_alleles = ["A", "T", "C", "G", "N"]
    # Stand-alone allele values additionally allowed (none for vcf).
    _allowed_ind_alleles = []

    def _validateFilename(self, filePath):
        """Assert the file is named GENIE-<center>-*.vcf."""
        basename = os.path.basename(filePath[0])
        startswith_genie = basename.startswith("GENIE-{}-".format(self.center))
        endswith_vcf = basename.endswith(".vcf")
        assert startswith_genie and endswith_vcf

    def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
        """Get mutation dataframe

        1) Looks for the line in the file starting with #CHROM, that will be
        the header line (columns).

        2) When reading in the data, we keep the 'NA', 'nan', and 'NaN'
        as strings in the data because these are valid allele values
        then convert the ones in the non-allele columns back to actual NAs

        Args:
            filePathList (List[str]): list of filepath(s)

        Raises:
            ValueError: when line with #CHROM doesn't exist in file

        Returns:
            pd.DataFrame: mutation data
        """
        headers = None
        filepath = filePathList[0]
        # Scan the leading meta lines for the #CHROM header row.
        with open(filepath, "r") as vcffile:
            for row in vcffile:
                if row.startswith("#CHROM"):
                    headers = row.replace("\n", "").replace("\r", "").split("\t")
                    break
        if headers is not None:
            # keep_default_na=False plus this explicit na_values list is the
            # pandas default NA set minus "NA"/"nan"/"NaN", which are valid
            # allele strings and must survive as text.
            vcfdf = pd.read_csv(
                filepath,
                sep="\t",
                comment="#",
                header=None,
                names=headers,
                keep_default_na=False,
                na_values=[
                    "-1.#IND",
                    "1.#QNAN",
                    "1.#IND",
                    "-1.#QNAN",
                    "#N/A N/A",
                    "#N/A",
                    "N/A",
                    "#NA",
                    "NULL",
                    "-NaN",
                    "-nan",
                    "",
                ],
            )
        else:
            raise ValueError("Your vcf must start with the header #CHROM")

        # Restore real NAs everywhere except the allele columns.
        vcfdf = transform._convert_values_to_na(
            input_df=vcfdf,
            values_to_replace=["NA", "nan", "NaN"],
            columns_to_convert=[
                col for col in vcfdf.columns if col not in self._allele_cols
            ],
        )
        return vcfdf

    def process_steps(self, df):
        """The processing of vcf files is specific to GENIE, so
        not included in this function"""
        logger.info(
            "Please run with `--process mutation` parameter "
            "if you want to reannotate the mutation files"
        )
        return None

    def _validate(self, vcfdf):
        """
        Validates the content of a vcf file

        Args:
            vcfdf: pandas dataframe containing vcf content

        Returns:
            total_error - error messages
            warning - warning messages
        """
        required_headers = pd.Series(
            ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        )
        total_error = ""
        warning = ""
        if not all(required_headers.isin(vcfdf.columns)):
            total_error += (
                "vcf: Must have these headers: "
                "CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.\n"
            )
        else:
            # No duplicated values
            primary_cols = ["#CHROM", "POS", "REF", "ALT"]
            if vcfdf.duplicated(primary_cols).any():
                total_error += "vcf: Must not have duplicate variants.\n"

            # Nulls in #CHROM/POS typically mean the row didn't split on tabs.
            if vcfdf[["#CHROM", "POS"]].isnull().values.any():
                total_error += (
                    "vcf: May contain rows that are "
                    "space delimited instead of tab delimited.\n"
                )
        # Vcf can only have max of 11 columns (8 fixed + FORMAT + two samples)
        if len(vcfdf.columns) > 11:
            total_error += (
                "vcf: Should not have more than 11 columns. Only "
                "single sample or matched tumor normal vcf files are accepted.\n"
            )
        elif len(vcfdf.columns) > 8:
            # If there are greater than 8 columns, there must be the FORMAT column
            if "FORMAT" not in vcfdf.columns:
                total_error += "vcf: Must have FORMAT header if sample columns exist.\n"
            # If 11 columns, this is assumed to be a tumor normal vcf
            if len(vcfdf.columns) == 11:
                sample_id = vcfdf.columns[-2]
                normal_id = vcfdf.columns[-1]
                error = process_functions.validate_genie_identifier(
                    identifiers=pd.Series([sample_id]),
                    center=self.center,
                    filename="vcf",
                    col="tumor sample column",
                )
                total_error += error
                error = process_functions.validate_genie_identifier(
                    identifiers=pd.Series([normal_id]),
                    center=self.center,
                    filename="vcf",
                    col="normal sample column",
                )
                total_error += error
            else:
                # Everything else above 8 columns that isn't 11 columns
                # will be assumed to be a single sample vcf.
                # if TUMOR is not the sample column header, then validate
                # the sample column header.
                if "TUMOR" not in vcfdf.columns:
                    sample_id = vcfdf.columns[-1]
                    error = process_functions.validate_genie_identifier(
                        identifiers=pd.Series([sample_id]),
                        center=self.center,
                        filename="vcf",
                        col="tumor sample column",
                    )
                    if error:
                        error = error.replace("\n", "")
                        error += " if vcf represents a single sample and TUMOR is not the sample column header.\n"
                        total_error += error

        # Require that they report variants mapped to
        # either GRCh37 or hg19 without
        # the chr-prefix.
        error, warn = validate._validate_chromosome(
            df=vcfdf, col="#CHROM", fileformat="vcf"
        )
        total_error += error
        warning += warn

        # Check each allele column for characters outside the allowed sets.
        for allele_col in self._allele_cols:
            if process_functions.checkColExist(vcfdf, allele_col):
                invalid_indices = validate.get_invalid_allele_rows(
                    vcfdf,
                    input_col=allele_col,
                    allowed_comb_alleles=self._allowed_comb_alleles,
                    allowed_ind_alleles=self._allowed_ind_alleles,
                    ignore_case=True,
                    allow_na=False,
                )
                errors, warnings = validate.get_allele_validation_message(
                    invalid_indices,
                    invalid_col=allele_col,
                    allowed_comb_alleles=self._allowed_comb_alleles,
                    allowed_ind_alleles=self._allowed_ind_alleles,
                    fileformat=self._fileType,
                )
                total_error += errors
                warning += warnings

        # No white spaces
        white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1)
        if sum(white_space) > 0:
            warning += "vcf: Should not have any white spaces in any of the columns.\n"

        # I can also recommend a `bcftools query` command that
        # will parse a VCF in a detailed way,
        # and output with warnings or errors if the format is not adhered too
        return total_error, warning
Functions
process_steps(df)

The processing of vcf files is specific to GENIE, so not included in this function

Source code in genie_registry/vcf.py
 93
 94
 95
 96
 97
 98
 99
100
def process_steps(self, df):
    """The processing of vcf files is specific to GENIE, so
    not included in this function"""
    logger.info(
        "Please run with `--process mutation` parameter "
        "if you want to reannotate the mutation files"
    )
    return None

Functions

contains_whitespace(row)

Gets the total number of whitespaces from each column of a row

Source code in genie_registry/vcf.py
13
14
15
def contains_whitespace(row):
    """Count the string cells of *row* that contain at least one space.

    Non-string cells are ignored, and only the literal ``" "`` character
    is checked (not tabs or other whitespace).
    """
    return sum(1 for cell in row if isinstance(cell, str) and " " in cell)