Skip to content

Cna

genie_registry.cna

Attributes

logger = logging.getLogger(__name__) module-attribute

Classes

cna

Bases: FileTypeFormat

Source code in genie_registry/cna.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
class cna(FileTypeFormat):
    """File format handler for a center's copy number alteration (CNA) file.

    Checks the file name and contents, remaps gene symbols against the
    center's rows of the bed database table, and writes and stores the
    processed file.
    """

    _fileType = "cna"

    _process_kwargs = ["newPath"]

    _validation_kwargs = ["nosymbol_check"]

    # VALIDATE FILENAME
    def _validateFilename(self, filePath):
        """Assert that the first path is named ``data_CNA_<center>.txt``.

        Args:
            filePath: Sequence of file paths; only the first entry is checked.

        Raises:
            AssertionError: If the basename does not match the expected name.
        """
        assert os.path.basename(filePath[0]) == "data_CNA_{}.txt".format(self.center)

    def _process(self, cnaDf):
        """Remap gene symbols and merge rows that end up sharing a symbol.

        The first column name is upper-cased and ``HUGO_SYMBOL`` is renamed to
        ``Hugo_Symbol``; the first column whose upper-cased name is
        ``ENTREZ_GENE_ID`` is removed. Both changes are applied to the
        passed-in dataframe in place.

        Args:
            cnaDf: CNA dataframe with a gene symbol column followed by value
                columns.

        Returns:
            Dataframe with the original column order in which rows with
            unmappable symbols are dropped and rows sharing a symbol are
            collapsed into one row via ``mergeCNAvalues``. Merged rows are
            appended after the rows that were never duplicated.
        """
        cnaDf.rename(columns={cnaDf.columns[0]: cnaDf.columns[0].upper()}, inplace=True)
        cnaDf.rename(columns={"HUGO_SYMBOL": "Hugo_Symbol"}, inplace=True)

        index = [
            i for i, col in enumerate(cnaDf.columns) if col.upper() == "ENTREZ_GENE_ID"
        ]
        if len(index) > 0:
            del cnaDf[cnaDf.columns[index][0]]

        bedSynId = self.genie_config["bed"]
        bed = self.syn.tableQuery(
            f"select Hugo_Symbol, ID from {bedSynId} where CENTER = '{self.center}'"
        )
        bedDf = bed.asDataFrame()
        # validateSymbol returns NaN for symbols that cannot be mapped
        cnaDf["Hugo_Symbol"] = cnaDf["Hugo_Symbol"].apply(
            lambda x: validateSymbol(x, bedDf)
        )
        order = cnaDf.columns
        # Drop the rows whose symbol could not be mapped
        cnaDf = cnaDf[~cnaDf["Hugo_Symbol"].isnull()]
        duplicatedGenes = pd.DataFrame()
        duplicated_symbols = cnaDf["Hugo_Symbol"][
            cnaDf["Hugo_Symbol"].duplicated()
        ].unique()
        for i in duplicated_symbols:
            dups = cnaDf[cnaDf["Hugo_Symbol"] == i]
            # Collapse every value column of the duplicated rows to one value
            newVal = dups[dups.columns[dups.columns != "Hugo_Symbol"]].apply(
                mergeCNAvalues
            )
            temp = pd.DataFrame(newVal).transpose()
            temp["Hugo_Symbol"] = i
            duplicatedGenes = pd.concat([duplicatedGenes, temp], sort=False)
        # Remove every row of a duplicated symbol; the merged replacements are
        # appended below. Rebinding (instead of inplace=True) avoids mutating a
        # frame derived from a boolean filter.
        cnaDf = cnaDf.drop_duplicates("Hugo_Symbol", keep=False)
        cnaDf = pd.concat([cnaDf, duplicatedGenes], sort=False)
        cnaDf = cnaDf[order]
        return cnaDf

    def process_steps(self, cnaDf, newPath):
        """Process the CNA dataframe, write it to ``newPath`` and store it.

        Nothing is written or stored when the processed dataframe is empty.

        Args:
            cnaDf: CNA dataframe to process.
            newPath: Path the processed text is written to.

        Returns:
            ``newPath``, whether or not a file was written.
        """
        newCNA = self._process(cnaDf)

        centerMafSynId = self.genie_config["centerMaf"]
        if not newCNA.empty:
            cnaText = process_functions.removePandasDfFloat(newCNA)
            # Replace blank with NA's. The "\t\t" replacement runs twice
            # because str.replace does not rescan overlapping matches, so a
            # run of consecutive blank cells needs a second pass.
            cnaText = (
                cnaText.replace("\t\t", "\tNA\t")
                .replace("\t\t", "\tNA\t")
                .replace("\t\n", "\tNA\n")
            )
            with open(newPath, "w") as cnaFile:
                cnaFile.write(cnaText)
            self.syn.store(synapseclient.File(newPath, parent=centerMafSynId))
        return newPath

    def _validate(self, cnvDF, nosymbol_check):
        """Validate the contents of a CNA dataframe.

        ``cnvDF`` is modified in place: column names are upper-cased, the
        ``HUGO_SYMBOL`` and ``ENTREZ_GENE_ID`` columns are dropped for the
        value checks, and ``HUGO_SYMBOL`` is re-added as the last column when
        it was present and all values are allowed.

        Args:
            cnvDF: CNA dataframe to validate.
            nosymbol_check: When truthy, skip the gene symbol remapping and
                duplicate check against the bed database table.

        Returns:
            Tuple ``(total_error, warning)`` of message strings; ``warning``
            is always empty in this implementation.
        """
        total_error = ""
        warning = ""
        cnvDF.columns = [col.upper() for col in cnvDF.columns]

        if cnvDF.columns[0] != "HUGO_SYMBOL":
            total_error += "Your cnv file's first column must be Hugo_Symbol\n"
        haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
        keepSymbols = None
        if haveColumn:
            # Set the symbols aside so only sample value columns remain
            keepSymbols = cnvDF["HUGO_SYMBOL"]
            cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)

        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF["ENTREZ_GENE_ID"]
        # The remaining column names are checked as sample identifiers
        error = process_functions.validate_genie_identifier(
            identifiers=cnvDF.columns, center=self.center, filename="cnv", col="samples"
        )
        total_error += error
        # Values are compared by their str() form, hence both the integer and
        # the float spelling of whole numbers.
        # NOTE(review): the error message below lists -0.5 as allowed, but
        # "-0.5" is not in this list - confirm which of the two is intended.
        allowed_values = [
            "-2.0",
            "-2",
            "-1.5",
            "-1.0",
            "-1",
            "0.0",
            "0",
            "0.5",
            "1.0",
            "1",
            "1.5",
            "2",
            "2.0",
            "nan",
        ]
        if not all(cnvDF.applymap(lambda x: str(x) in allowed_values).all()):
            total_error += (
                "All values must be NA/blank, -2, -1.5, -1, -0.5, "
                "0, 0.5, 1, 1.5, or 2.\n"
            )
        elif haveColumn:
            # Only restore/check symbols when the column existed; otherwise
            # there is nothing to restore (the missing column has already been
            # reported above).
            cnvDF["HUGO_SYMBOL"] = keepSymbols
            if not nosymbol_check:
                bedSynId = self.genie_config["bed"]
                bed = self.syn.tableQuery(
                    f"select Hugo_Symbol, ID from {bedSynId} "
                    f"where CENTER = '{self.center}'"
                )
                bedDf = bed.asDataFrame()
                cnvDF["remapped"] = cnvDF["HUGO_SYMBOL"].apply(
                    lambda x: validateSymbol(x, bedDf)
                )
                # Symbols that cannot be mapped are ignored for this check
                cnvDF = cnvDF[~cnvDF["remapped"].isnull()]

                # Do not allow any duplicated genes after symbols
                # have been remapped
                if sum(cnvDF["remapped"].duplicated()) > 0:
                    duplicated = cnvDF["remapped"].duplicated(keep=False)
                    total_error += (
                        "Your CNA file has duplicated Hugo_Symbols "
                        "(After remapping of genes): {} -> {}.\n".format(
                            ",".join(cnvDF["HUGO_SYMBOL"][duplicated]),
                            ",".join(cnvDF["remapped"][duplicated]),
                        )
                    )
        return (total_error, warning)

Functions

validateSymbol(gene, bedDf, returnMappedDf=True)

Validates the gene symbol against the gene symbol in the bed database. Note that gene symbols in the bed database have gone through processing and have been remapped to allowed actual genes if needed.

A gene is VALID if it meets either of the following two conditions:
  1. The gene exists in the bed database table's Hugo_Symbol column

  2. The gene exists in the bed database table's ID column. Under this condition, the gene in the cna file will be REMAPPED temporarily to the bed database table's Hugo_Symbol value for the purpose of validation. The ID column is the original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets mapped to valid possible gene values in the Actual Gene Positions (GRCh37) database table. See the bed fileformat module's remap_symbols function and how it gets used in processing for more info on this.

The validation logs a WARNING if the gene satisfies neither of the above two conditions.

PARAMETER DESCRIPTION
gene

Gene name

TYPE: str

bedDf

The bed database table as a pandas dataframe

TYPE: DataFrame

returnMappedDf

Return a mapped gene. Defaults to True

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
Union[str, float, bool]

Union[str, float, bool]:

Union[str, float, bool]

Returns gene symbol (str if valid, a float("nan") if invalid) if returnMappedDf is True

Union[str, float, bool]

Returns boolean for whether a gene is valid if returnMappedDf is False

Source code in genie_registry/cna.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def validateSymbol(
    gene: str, bedDf: pd.DataFrame, returnMappedDf: bool = True
) -> Union[str, float, bool]:
    """
    Validates the gene symbol against the gene symbol in the bed database.
    Note that gene symbols in the bed database have gone through processing and
    have been remapped to allowed actual genes if needed.

    A gene is VALID if it meets either of these two conditions:
        1. The gene exists in the bed database table's Hugo_Symbol column

        2. The gene exists in the bed database table's ID column. Under this condition,
        the gene in the cna file will be REMAPPED temporarily to the bed database
        table's Hugo_Symbol value for the purpose of validation. The ID column is the
        original Hugo_Symbol column of the bed files before the Hugo_Symbol column gets
        mapped to valid possible gene values in the Actual Gene Positions (GRCh37)
        database table. See the bed fileformat module's remap_symbols function and
        how it gets used in processing for more info on this.

    A WARNING is logged if the gene satisfies neither of the above two
    conditions.

    Args:
        gene: Gene name
        bedDf: The bed database table as a pandas dataframe
        returnMappedDf: Return a mapped gene. Defaults to True

    Returns:
        Union[str, float, bool]:
        Returns gene symbol (str if valid, a float("nan") if invalid) if returnMappedDf is True
        Returns boolean for whether a gene is valid if returnMappedDf is False
    """
    valid = False
    if (bedDf["Hugo_Symbol"] == gene).any():
        valid = True
    elif (bedDf["ID"] == gene).any():
        # Use the Hugo_Symbol of the first bed row whose ID matches
        mapped_gene = bedDf.loc[bedDf["ID"] == gene, "Hugo_Symbol"].values[0]
        logger.info("{} will be remapped to {}".format(gene, mapped_gene))
        gene = mapped_gene
        # A remappable gene counts as valid (condition 2 above)
        valid = True
    else:
        logger.warning(
            "{} cannot be remapped and will not be released. The symbol "
            "must exist in your seq assay ids (bed files) and must be "
            "mappable to a gene.".format(gene)
        )
        gene = float("nan")
    return gene if returnMappedDf else valid

makeCNARow(row, symbols)

Make CNA Row (Deprecated function)

CNA values are no longer stored in the database

PARAMETER DESCRIPTION
row

one row in the CNA file

symbols

list of Gene symbols

Source code in genie_registry/cna.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def makeCNARow(row, symbols):
    """
    Build the two-line text for one CNA row (Deprecated function)

    CNA values are no longer stored in the database

    Args:
        row: one row in the CNA file
        symbols: list of Gene symbols

    Returns:
        The comma-joined symbols, a newline, then the comma-joined values,
        with every ".0" substring removed from the whole text.
    """
    header = ",".join(symbols)
    values = ",".join(row.astype(str))
    return f"{header}\n{values}".replace(".0", "")

mergeCNAvalues(x)

Merge CNA values so that when two rows refer to the same gene, their values are combined into a single value.

Source code in genie_registry/cna.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def mergeCNAvalues(x):
    """Merge the CNA values of rows that share a gene into a single value.

    Args:
        x: pandas Series holding the values to merge.

    Returns:
        - the distinct non-null value, when there is exactly one;
        - the non-zero value, when the distinct non-null values are zero and
          one other value;
        - float("nan") otherwise (no non-null values, or conflicting ones).
    """
    # Change into its own series, because sometimes doing an apply
    # will cause there to be a missing index value which will
    # cause dropna() to fail.
    unique_values = set(pd.Series(x.values).dropna().tolist())
    if len(unique_values) == 2:
        # A zero next to one other value defers to that other value
        unique_values.discard(0)
    if len(unique_values) == 1:
        # Return the distinct value itself rather than the first element of
        # x, which may be NaN and would make the result depend on row order.
        return unique_values.pop()
    return float("nan")

checkIfOneZero(x)

Source code in genie_registry/cna.py
108
109
def checkIfOneZero(x):
    """Assert that ``x`` holds exactly one distinct value.

    Args:
        x: Object exposing ``tolist()`` (e.g. a pandas Series).

    Raises:
        AssertionError: If the number of distinct values is not one.
    """
    distinct_values = set(x.tolist())
    assert len(distinct_values) == 1, "Can only be one unique value"