Skip to content

Maf

genie_registry.maf

Attributes

logger = logging.getLogger(__name__) module-attribute

Classes

FileTypeFormat

Source code in genie/example_filetype_format.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
class FileTypeFormat(metaclass=ABCMeta):
    """Abstract base class for validating and processing a center's files.

    Subclasses override the hook methods (``_validateFilename``,
    ``_get_dataframe``, ``_validate``, ``_cross_validate``,
    ``process_steps``) while the shared ``validate``/``process`` drivers
    defined here orchestrate them.
    """

    # Required keyword arguments for process(); subclasses may override.
    _process_kwargs = ["newPath", "databaseSynId"]

    # Identifier for this file type; returned by validateFilename().
    _fileType = "fileType"

    # Required keyword arguments for validate(); subclasses may override.
    _validation_kwargs: List[str] = []

    def __init__(
        self,
        syn: synapseclient.Synapse,
        center: str,
        genie_config: Optional[dict] = None,
        ancillary_files: Optional[List[List[synapseclient.Entity]]] = None,
    ):
        """A validator helper class for a center's files.

        Args:
            syn (synapseclient.Synapse): a synapseclient.Synapse object
            center (str): The participating center name.
            genie_config (dict): The configurations needed for the GENIE codebase.
                GENIE table type/name to Synapse Id. Defaults to None.
            ancillary_files (List[List[synapseclient.Entity]]): all files downloaded for validation. Defaults to None.
        """
        self.syn = syn
        self.center = center
        self.genie_config = genie_config
        # Used by validate() to decide whether cross-validation can run.
        self.ancillary_files = ancillary_files

        # self.pool = multiprocessing.Pool(poolSize)

    def _get_dataframe(self, filePathList):
        """
        This function by defaults assumes the filePathList is length of 1
        and is a tsv file.  Could change depending on file type.

        Args:
            filePathList:  A list of file paths (Max is 2 for the two
                           clinical files)

        Returns:
            df: Pandas dataframe of file
        """
        # Only the first path is read here; subclasses that accept multiple
        # files override this hook.
        filePath = filePathList[0]
        df = pd.read_csv(filePath, sep="\t", comment="#")
        return df

    def read_file(self, filePathList):
        """
        Each file is to be read in for validation and processing.
        This is not to be changed in any functions.

        Args:
            filePathList:  A list of file paths (Max is 2 for the two
                           clinical files)

        Returns:
            df: Pandas dataframe of file
        """
        # Delegates to the subclass-specific _get_dataframe hook.
        df = self._get_dataframe(filePathList)
        return df

    def _validateFilename(self, filePath):
        """
        Function that changes per file type for validating its filename
        Expects an assertion error.

        Args:
            filePath: Path to file
        """
        # assert True
        raise NotImplementedError

    def validateFilename(self, filePath):
        """
        Validation of file name.  The filename is what maps the file
        to its validation and processing.

        Args:
            filePath: Path to file

        Returns:
            str: file type defined by self._fileType
        """
        # _validateFilename raises (expected: AssertionError) on an
        # invalid name; reaching the return means the name is valid.
        self._validateFilename(filePath)
        return self._fileType

    def process_steps(self, df, **kwargs):
        """
        This function is modified for every single file.
        It reformats the file and stores the file into database and Synapse.
        """
        # Base implementation is a no-op and returns None.
        pass

    def preprocess(self, newpath):
        """
        This is for any preprocessing that has to occur to the entity name
        to add to kwargs for processing.  entity name is included in
        the new path

        Args:
            newpath: Path to file

        Returns:
            dict: extra keyword arguments merged into process() kwargs.
                Base implementation contributes none.
        """
        return dict()

    def process(self, filePath, **kwargs):
        """
        This is the main processing function.

        Args:
            filePath: Path to file
            kwargs: The kwargs are determined by self._process_kwargs

        Returns:
            str: file path of processed file
        """
        preprocess_args = self.preprocess(kwargs.get("newPath"))
        kwargs.update(preprocess_args)
        # Keep only the kwargs this file type declares; missing required
        # parameters fail fast with an AssertionError.
        mykwargs = {}
        for required_parameter in self._process_kwargs:
            assert required_parameter in kwargs.keys(), (
                "%s not in parameter list" % required_parameter
            )
            mykwargs[required_parameter] = kwargs[required_parameter]
        logger.info("PROCESSING %s" % filePath)
        # This is done because the clinical files are being merged into a list
        if self._fileType == "clinical":
            path_or_df = self.read_file(filePath)
        # vcf/maf/mafSP/md processing operates on the file path itself;
        # all other file types are first read into a dataframe.
        elif self._fileType not in ["vcf", "maf", "mafSP", "md"]:
            path_or_df = self.read_file([filePath])
        else:
            path_or_df = filePath
        path = self.process_steps(path_or_df, **mykwargs)
        return path

    def _validate(self, df: pd.DataFrame, **kwargs) -> tuple:
        """
        This is the base validation function.
        By default, no validation occurs.

        Args:
            df (pd.DataFrame): A dataframe of the file
            kwargs: The kwargs are determined by self._validation_kwargs

        Returns:
            tuple: The errors and warnings as strings from validation.
                   Defaults to blank strings
        """
        errors = ""
        warnings = ""
        logger.info("NO VALIDATION for %s files" % self._fileType)
        return errors, warnings

    def _cross_validate(self, df: pd.DataFrame) -> tuple:
        """
        This is the base cross-validation function.
        By default, no cross-validation occurs.

        Args:
            df (pd.DataFrame): A dataframe of the file

        Returns:
            tuple: The errors and warnings as strings from cross-validation.
                   Defaults to blank strings
        """
        errors = ""
        warnings = ""
        logger.info("NO CROSS-VALIDATION for %s files" % self._fileType)
        return errors, warnings

    def validate(self, filePathList, **kwargs) -> ValidationResults:
        """
        This is the main validation function.
        Every file type calls self._validate, which is different.

        Args:
            filePathList: A list of file paths.
            kwargs: The kwargs are determined by self._validation_kwargs

        Returns:
            ValidationResults: The errors and warnings from validation
                (and cross-validation when ancillary files are available).
        """
        mykwargs = {}
        for required_parameter in self._validation_kwargs:
            assert required_parameter in kwargs.keys(), (
                "%s not in parameter list" % required_parameter
            )
            mykwargs[required_parameter] = kwargs[required_parameter]

        errors = ""

        try:
            df = self.read_file(filePathList)
        except Exception as e:
            errors = (
                f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
            )
            warnings = ""

        if not errors:
            logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
            errors, warnings = self._validate(df, **mykwargs)
            # only cross-validate if validation passes and ancillary files exist
            # Assumes that self.ancillary_files won't be [[]] due to whats returned from
            # extract.get_center_input_files
            if not errors and (
                isinstance(self.ancillary_files, list) and self.ancillary_files
            ):
                logger.info(
                    "CROSS-VALIDATING %s" % os.path.basename(",".join(filePathList))
                )
                errors_cross_validate, warnings_cross_validate = self._cross_validate(
                    df
                )
                errors += errors_cross_validate
                warnings += warnings_cross_validate

        result_cls = ValidationResults(errors=errors, warnings=warnings)
        return result_cls
Functions
__init__(syn, center, genie_config=None, ancillary_files=None)

A validator helper class for a center's files.

PARAMETER DESCRIPTION
syn

a synapseclient.Synapse object

TYPE: Synapse

center

The participating center name.

TYPE: str

genie_config

The configurations needed for the GENIE codebase. GENIE table type/name to Synapse Id. Defaults to None.

TYPE: dict DEFAULT: None

ancillary_files

all files downloaded for validation. Defaults to None.

TYPE: List[List[Entity]] DEFAULT: None

Source code in genie/example_filetype_format.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def __init__(
    self,
    syn: synapseclient.Synapse,
    center: str,
    genie_config: Optional[dict] = None,
    ancillary_files: Optional[List[List[synapseclient.Entity]]] = None,
):
    """A validator helper class for a center's files.

    Args:
        syn (synapseclient.Synapse): a synapseclient.Synapse object
        center (str): The participating center name.
        genie_config (dict): The configurations needed for the GENIE codebase.
            GENIE table type/name to Synapse Id. Defaults to None.
        ancillary_files (List[List[synapseclient.Entity]]): all files downloaded for validation. Defaults to None.
    """
    self.syn = syn
    self.center = center
    self.genie_config = genie_config
    # Used by validate() to decide whether cross-validation can run.
    self.ancillary_files = ancillary_files
read_file(filePathList)

Each file is to be read in for validation and processing. This is not to be changed in any functions.

PARAMETER DESCRIPTION
filePathList

A list of file paths (Max is 2 for the two clinical files)

RETURNS DESCRIPTION
df

Pandas dataframe of file

Source code in genie/example_filetype_format.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def read_file(self, filePathList):
    """
    Each file is to be read in for validation and processing.
    This is not to be changed in any functions.

    Args:
        filePathList:  A list of file paths (Max is 2 for the two
                       clinical files)

    Returns:
        df: Pandas dataframe of file
    """
    # Delegates to the subclass-specific _get_dataframe hook.
    df = self._get_dataframe(filePathList)
    return df
validateFilename(filePath)

Validation of file name. The filename is what maps the file to its validation and processing.

PARAMETER DESCRIPTION
filePath

Path to file

RETURNS DESCRIPTION
str

file type defined by self._fileType

Source code in genie/example_filetype_format.py
125
126
127
128
129
130
131
132
133
134
135
136
137
def validateFilename(self, filePath):
    """
    Validation of file name.  The filename is what maps the file
    to its validation and processing.

    Args:
        filePath: Path to file

    Returns:
        str: file type defined by self._fileType
    """
    # _validateFilename raises (expected: AssertionError) on an invalid
    # name; reaching the return means the name is valid.
    self._validateFilename(filePath)
    return self._fileType
process_steps(df, **kwargs)

This function is modified for every single file. It reformats the file and stores the file into database and Synapse.

Source code in genie/example_filetype_format.py
139
140
141
142
143
144
def process_steps(self, df, **kwargs):
    """
    This function is modified for every single file.
    It reformats the file and stores the file into database and Synapse.
    """
    # Base implementation is a no-op and returns None.
    pass
preprocess(newpath)

This is for any preprocessing that has to occur to the entity name to add to kwargs for processing. entity name is included in the new path

PARAMETER DESCRIPTION
newpath

Path to file

Source code in genie/example_filetype_format.py
146
147
148
149
150
151
152
153
154
155
def preprocess(self, newpath):
    """
    This is for any preprocessing that has to occur to the entity name
    to add to kwargs for processing.  entity name is included in
    the new path

    Args:
        newpath: Path to file

    Returns:
        dict: extra keyword arguments merged into process() kwargs.
            Base implementation contributes none.
    """
    return dict()
process(filePath, **kwargs)

This is the main processing function.

PARAMETER DESCRIPTION
filePath

Path to file

kwargs

The kwargs are determined by self._process_kwargs

DEFAULT: {}

RETURNS DESCRIPTION
str

file path of processed file

Source code in genie/example_filetype_format.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def process(self, filePath, **kwargs):
    """
    This is the main processing function.

    Args:
        filePath: Path to file
        kwargs: The kwargs are determined by self._process_kwargs

    Returns:
        str: file path of processed file
    """
    preprocess_args = self.preprocess(kwargs.get("newPath"))
    kwargs.update(preprocess_args)
    # Keep only the kwargs this file type declares; missing required
    # parameters fail fast with an AssertionError.
    mykwargs = {}
    for required_parameter in self._process_kwargs:
        assert required_parameter in kwargs.keys(), (
            "%s not in parameter list" % required_parameter
        )
        mykwargs[required_parameter] = kwargs[required_parameter]
    logger.info("PROCESSING %s" % filePath)
    # This is done because the clinical files are being merged into a list
    if self._fileType == "clinical":
        path_or_df = self.read_file(filePath)
    # vcf/maf/mafSP/md processing operates on the file path itself;
    # all other file types are first read into a dataframe.
    elif self._fileType not in ["vcf", "maf", "mafSP", "md"]:
        path_or_df = self.read_file([filePath])
    else:
        path_or_df = filePath
    path = self.process_steps(path_or_df, **mykwargs)
    return path
validate(filePathList, **kwargs)

This is the main validation function. Every file type calls self._validate, which is different.

PARAMETER DESCRIPTION
filePathList

A list of file paths.

kwargs

The kwargs are determined by self._validation_kwargs

DEFAULT: {}

RETURNS DESCRIPTION
ValidationResults

The errors and warnings from validation (and cross-validation when ancillary files are available).

TYPE: ValidationResults

Source code in genie/example_filetype_format.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def validate(self, filePathList, **kwargs) -> ValidationResults:
    """
    This is the main validation function.
    Every file type calls self._validate, which is different.

    Args:
        filePathList: A list of file paths.
        kwargs: The kwargs are determined by self._validation_kwargs

    Returns:
        ValidationResults: The errors and warnings from validation
            (and cross-validation when ancillary files are available).
    """
    mykwargs = {}
    for required_parameter in self._validation_kwargs:
        assert required_parameter in kwargs.keys(), (
            "%s not in parameter list" % required_parameter
        )
        mykwargs[required_parameter] = kwargs[required_parameter]

    errors = ""

    try:
        df = self.read_file(filePathList)
    except Exception as e:
        errors = (
            f"The file(s) ({filePathList}) cannot be read. Original error: {str(e)}"
        )
        warnings = ""

    if not errors:
        logger.info("VALIDATING %s" % os.path.basename(",".join(filePathList)))
        errors, warnings = self._validate(df, **mykwargs)
        # only cross-validate if validation passes and ancillary files exist
        # Assumes that self.ancillary_files won't be [[]] due to whats returned from
        # extract.get_center_input_files
        if not errors and (
            isinstance(self.ancillary_files, list) and self.ancillary_files
        ):
            logger.info(
                "CROSS-VALIDATING %s" % os.path.basename(",".join(filePathList))
            )
            errors_cross_validate, warnings_cross_validate = self._cross_validate(
                df
            )
            errors += errors_cross_validate
            warnings += warnings_cross_validate

    result_cls = ValidationResults(errors=errors, warnings=warnings)
    return result_cls

maf

Bases: FileTypeFormat

MAF file format validation / processing

Source code in genie_registry/maf.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
class maf(FileTypeFormat):
    """
    MAF file format validation / processing
    """

    _fileType = "maf"

    _process_kwargs = []
    # Allele columns keep literal "NA"/"nan"/"NaN" strings when the file is
    # read, because those are valid allele values (see _get_dataframe).
    _allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
    # Characters that may appear (alone or combined) in an allele value.
    _allowed_comb_alleles = ["A", "T", "C", "G", "N"]
    # Characters only valid as a standalone allele value.
    _allowed_ind_alleles = ["-"]

    def _validateFilename(self, filePath):
        """
        Validates filename.  Should be
        data_mutations_extended_CENTER.txt
        """
        assert os.path.basename(filePath[0]) == "data_mutations_extended_{}.txt".format(
            self.center
        )

    def process_steps(self, df):
        """The processing of maf files is specific to GENIE, so
        not included in this function"""
        logger.info(
            "Please run with `--process mutation` parameter "
            "if you want to reannotate the mutation files"
        )
        return None

    def _validate(self, mutationDF):
        """
        This function validates the mutation file to make sure it
        adheres to the mutation SOP.

        t_depth: This column is conditionally optional.
        1. If this column is missing, the data must include the t_ref_count column. Otherwise, it will cause a validation error.
        2. If this column is present, it must have one of the following:
            - A mix of numeric values and NAs
            - All NAs
            - All numeric values

        There are no other checks on the actual values in this column.

        t_ref_count: This column is conditionally optional.
        1. If this column is missing, the data must include the t_depth column. Otherwise, it will cause a validation error.
        2. If this column is present, it must have one of the following:
            - A mix of numeric values and NAs
            - All NAs
            - All numeric values

        There are no other checks on the actual values in this column.

        t_alt_count: This column is entirely optional.
        1. If this column is present, it must have one of the following:
            - A mix of numeric values and NAs
            - All NAs
            - All numeric values

        There are no other checks on the actual values in this column.

        Args:
            mutationDF: mutation dataframe

        Returns:
            Text with all the errors in the mutation file
        """

        first_header = ["CHROMOSOME", "HUGO_SYMBOL", "TUMOR_SAMPLE_BARCODE"]
        # mafSP files have a slightly smaller set of required columns
        # (no T_ALT_COUNT) and skip the depth/optional-header checks below.
        SP = self._fileType == "mafSP"
        if SP:
            correct_column_headers = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "TUMOR_SEQ_ALLELE2",
            ]
            # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
        else:
            correct_column_headers = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "T_ALT_COUNT",
                "TUMOR_SEQ_ALLELE2",
            ]
            # T_REF_COUNT + T_ALT_COUNT = T_DEPTH
        optional_headers = ["T_REF_COUNT", "N_DEPTH", "N_REF_COUNT", "N_ALT_COUNT"]

        # Column checks below are case-insensitive: normalize to upper case.
        mutationDF.columns = [col.upper() for col in mutationDF.columns]

        # Errors/warnings are accumulated in buffers and joined at the end.
        total_error = StringIO()
        warning = StringIO()

        # CHECK: Everything in correct_column_headers must be in mutation file
        if not all(
            [
                process_functions.checkColExist(mutationDF, i)
                for i in correct_column_headers
            ]
        ):
            total_error.write(
                "maf: Must at least have these headers: {}. "
                "If you are writing your maf file with R, please make "
                "sure to specify the 'quote=FALSE' parameter.\n".format(
                    ",".join(
                        [
                            i
                            for i in correct_column_headers
                            if i not in mutationDF.columns.values
                        ]
                    )
                )
            )
        else:
            # CHECK: First column must be in the first_header list
            if mutationDF.columns[0] not in first_header:
                total_error.write(
                    "maf: First column header must be "
                    "one of these: {}.\n".format(", ".join(first_header))
                )
            # No duplicated values across the variant primary key
            primary_cols = [
                "CHROMOSOME",
                "START_POSITION",
                "REFERENCE_ALLELE",
                "TUMOR_SAMPLE_BARCODE",
                "TUMOR_SEQ_ALLELE2",
            ]
            # Strip white space if string column
            for col in primary_cols:
                if mutationDF[col].dtype == object:
                    mutationDF[col] = mutationDF[col].str.strip()
            duplicated_idx = mutationDF.duplicated(primary_cols)

            if duplicated_idx.any():
                # Find samples with duplicated variants
                duplicated_variants = (
                    mutationDF["TUMOR_SAMPLE_BARCODE"][duplicated_idx]
                    .unique()
                    .astype(str)
                    .tolist()
                )
                total_error.write(
                    "maf: Must not have duplicated variants. "
                    "Samples with duplicated variants: "
                    f"{', '.join(duplicated_variants)}\n"
                )

        # CHECK: T_DEPTH and T_REF_COUNT are conditionally optional; at least
        # one must be present for non-SP maf files (see docstring).
        t_depth_exists = process_functions.checkColExist(mutationDF, "T_DEPTH")
        t_ref_exists = process_functions.checkColExist(mutationDF, "T_REF_COUNT")
        if not t_depth_exists and not t_ref_exists and not SP:
            total_error.write("maf: If missing T_DEPTH, must have T_REF_COUNT!\n")
        numerical_cols = [
            "T_DEPTH",
            "T_ALT_COUNT",
            "T_REF_COUNT",
            "N_DEPTH",
            "N_REF_COUNT",
            "N_ALT_COUNT",
            "START_POSITION",
            "END_POSITION",
        ]
        actual_numerical_cols = []
        for col in numerical_cols:
            col_exists = process_functions.checkColExist(mutationDF, col)
            if col_exists:
                # Attempt to convert column to float; a ValueError means the
                # column holds non-numeric strings and is reported below.
                try:
                    mutationDF[col] = mutationDF[col].astype(float)
                except ValueError:
                    pass
                if mutationDF[col].dtype not in [int, float]:
                    total_error.write(f"maf: {col} must be a numerical column.\n")
                else:
                    actual_numerical_cols.append(col)

        # CHECK: Must have TUMOR_SEQ_ALLELE2
        error, warn = _check_allele_col(mutationDF, "TUMOR_SEQ_ALLELE2")
        total_error.write(error)
        warning.write(warn)

        # CHECK: Mutation file would benefit from columns in optional_headers
        if (
            not all(
                [
                    process_functions.checkColExist(mutationDF, i)
                    for i in optional_headers
                ]
            )
            and not SP
        ):
            warning.write(
                "maf: Does not have the column headers that can give extra "
                "information to the processed maf: {}.\n".format(
                    ", ".join(
                        [
                            i
                            for i in optional_headers
                            if i not in mutationDF.columns.values
                        ]
                    )
                )
            )

        # CHECK: Must have REFERENCE_ALLELE
        error, warn = _check_allele_col(mutationDF, "REFERENCE_ALLELE")
        total_error.write(error)
        warning.write(warn)

        # CHECK: chromosome values (no 'chr' prefix allowed for maf)
        error, warn = validate._validate_chromosome(
            df=mutationDF, col="CHROMOSOME", fileformat="maf", allow_chr=False
        )
        total_error.write(error)
        warning.write(warn)

        # CHECK: TSA1/TSA2/REF combination rules (see _check_allele_col_validity)
        error = _check_allele_col_validity(mutationDF)
        total_error.write(error)

        if process_functions.checkColExist(mutationDF, "TUMOR_SAMPLE_BARCODE"):
            error = process_functions.validate_genie_identifier(
                identifiers=mutationDF["TUMOR_SAMPLE_BARCODE"],
                center=self.center,
                filename="maf",
                col="TUMOR_SAMPLE_BARCODE",
            )
            total_error.write(error)

        # only check end position as start position is required col; both
        # must also have passed the numeric-column check above.
        if (
            process_functions.checkColExist(mutationDF, "START_POSITION")
            and process_functions.checkColExist(mutationDF, "END_POSITION")
            and set(["START_POSITION", "END_POSITION"]) <= set(actual_numerical_cols)
        ):
            errors, warnings = validate.check_variant_start_and_end_positions(
                input_df=mutationDF,
                start_pos_col="START_POSITION",
                end_pos_col="END_POSITION",
                filename="maf",
            )
            total_error.write(errors)
            warning.write(warnings)

        # CHECK: allele columns may only contain the allowed characters
        for allele_col in self._allele_cols:
            if process_functions.checkColExist(mutationDF, allele_col):
                invalid_indices = validate.get_invalid_allele_rows(
                    mutationDF,
                    allele_col,
                    allowed_comb_alleles=self._allowed_comb_alleles,
                    allowed_ind_alleles=self._allowed_ind_alleles,
                    ignore_case=True,
                    allow_na=False,
                )
                errors, warnings = validate.get_allele_validation_message(
                    invalid_indices,
                    invalid_col=allele_col,
                    allowed_comb_alleles=self._allowed_comb_alleles,
                    allowed_ind_alleles=self._allowed_ind_alleles,
                    fileformat=self._fileType,
                )
                total_error.write(errors)
                warning.write(warnings)

        return total_error.getvalue(), warning.getvalue()

    def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
        """This function cross-validates the mutation file to make sure it
        adheres to the mutation SOP.

        Args:
            mutationDF (pd.DataFrame): mutation dataframe

        Returns:
            Text with all the errors in the mutation file
        """
        errors = ""
        warnings = ""

        # This section can be removed once we remove the list of lists
        clinical_files = validate.parse_file_info_in_nested_list(
            nested_list=self.ancillary_files, search_str="data_clinical_supp"  # type: ignore[arg-type]
        )
        clinical_file_paths = clinical_files["file_info"]["path"]

        if clinical_files["files"]:
            # Cross-validation is best-effort: an unreadable clinical file is
            # not this file's error, so it is skipped rather than reported.
            try:
                clinical_sample_df = process_functions.get_clinical_dataframe(
                    filePathList=clinical_file_paths
                )
                has_file_read_error = False
            except Exception:
                has_file_read_error = True

            if not has_file_read_error:
                if process_functions.checkColExist(clinical_sample_df, "SAMPLE_ID"):
                    errors, warnings = validate.check_values_between_two_df(
                        df1=mutationDF,
                        df1_filename="MAF",
                        df1_id_to_check="TUMOR_SAMPLE_BARCODE",
                        df2=clinical_sample_df,
                        df2_filename="sample clinical",
                        df2_id_to_check="SAMPLE_ID",
                    )
        return errors, warnings

    def _get_dataframe(self, filePathList: List[str]) -> pd.DataFrame:
        """Get mutation dataframe

        1) Starts reading the first line in the file
        2) Skips lines that starts with #
        3) Reads in second line
        4) Checks that first line fields matches second line. Must do this because
        pandas.read_csv will allow for a file to have more column headers than content.
        E.g)  A,B,C,D,E
              1,2
              2,3

        5) We keep the 'NA', 'nan', and 'NaN' as strings in the data because
        these are valid allele values
        then convert the ones in the non-allele columns back to actual NAs

        NOTE: Because allele columns are case-insensitive in maf data, we must
        standardize the case of the columns when checking for the non-allele columns
        to convert the NA strings to NAs

        NOTE: This code allows empty dataframes to pass through
        without errors

        Args:
            filePathList (List[str]): list of filepath(s)

        Raises:
            ValueError: First line fields doesn't match second line fields in file

        Returns:
            pd.DataFrame: mutation data

        """
        with open(filePathList[0], "r") as maf_f:
            firstline = maf_f.readline()
            # A leading '#' line is a comment; the header is the next line.
            if firstline.startswith("#"):
                firstline = maf_f.readline()
            secondline = maf_f.readline()

        if len(firstline.split("\t")) != len(secondline.split("\t")):
            raise ValueError(
                "Number of fields in a line do not match the "
                "expected number of columns"
            )

        read_csv_params = {
            "filepath_or_buffer": filePathList[0],
            "sep": "\t",
            "comment": "#",
            # Disable pandas' default NA strings so "NA"/"nan"/"NaN" survive
            # as text; they are valid allele values (restored below for
            # non-allele columns only).
            "keep_default_na": False,
            "na_values": [
                "-1.#IND",
                "1.#QNAN",
                "1.#IND",
                "-1.#QNAN",
                "#N/A N/A",
                "#N/A",
                "N/A",
                "#NA",
                "NULL",
                "-NaN",
                "-nan",
                "",
            ],
            # This is to check if people write files
            # with R, quote=T
            "quoting": 3,
            # Retain completely blank lines so that
            # validator will cause the file to fail
            "skip_blank_lines": False,
        }

        mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params)

        mutationdf = transform._convert_values_to_na(
            input_df=mutationdf,
            values_to_replace=["NA", "nan", "NaN"],
            columns_to_convert=[
                col
                for col in mutationdf.columns
                if col.upper() not in self._allele_cols
            ],
        )
        return mutationdf
Functions
process_steps(df)

The processing of maf files is specific to GENIE, so not included in this function

Source code in genie_registry/maf.py
170
171
172
173
174
175
176
177
def process_steps(self, df):
    """Skip GENIE-specific maf processing.

    Reannotation of mutation files is handled by a separate pipeline
    step, so this method only logs a pointer to it and performs no
    transformation of ``df``.

    Args:
        df: mutation dataframe (unused)

    Returns:
        None
    """
    reannotation_hint = (
        "Please run with `--process mutation` parameter "
        "if you want to reannotate the mutation files"
    )
    logger.info(reannotation_hint)
    return None

Functions

_check_allele_col_validity(df)

This function checks specific columns in a MAF (Mutation Annotation Format) file for certain conditions.

The following conditions must be met

If the MAF file has all three of these columns

- TUMOR_SEQ_ALLELE1 (TSA1)
- TUMOR_SEQ_ALLELE2 (TSA2)
- REFERENCE_ALLELE (REF)

Then, one of the following must be true

- Every value in TSA1 must be the same as the value in REF
- Every value in TSA1 must be the same as the value in TSA2

Additionally, if the MAF file has at least these two columns

- REFERENCE_ALLELE (REF)
- TUMOR_SEQ_ALLELE2 (TSA2)

Then

NO values in REF can match TSA2

These rules are important because Genome Nexus (GN) uses TSA1 to annotate data when it's not clear which variant to use. So, there can't be a mix of rows where some have TSA1 equal to REF and some have TSA1 equal to TSA2.

Valid Examples
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| C                | C                 | A                 |
| T                | T                 | C                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| C                | A                 | A                 |
| T                | C                 | C                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- |
| C                | A                 |
| T                | C                 |
Invalid Examples
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| C                | C                 | A                 |
| C                | A                 | A                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- | ----------------- |
| A                | C                 | A                 |
| T                | C                 | T                 |
| REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
| ---------------- | ----------------- |
| C                | C                 |
| T                | C                 |

See this Genome Nexus issue for more background regarding why this validation rule was implemented.

PARAMETER DESCRIPTION
df

input mutation dataframe

TYPE: DataFrame

RETURNS DESCRIPTION
str

the error message

TYPE: str

Source code in genie_registry/maf.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def _check_allele_col_validity(df: pd.DataFrame) -> str:
    """
    This function checks specific columns in a MAF (Mutation Annotation Format)
    file for certain conditions.

    The following conditions must be met:
        **If the MAF file has all three of these columns**

            - TUMOR_SEQ_ALLELE1 (TSA1)
            - TUMOR_SEQ_ALLELE2 (TSA2)
            - REFERENCE_ALLELE (REF)

        **Then, one of the following must be true**

            - Every value in TSA1 must be the same as the value in REF
            - Every value in TSA1 must be the same as the value in TSA2

        **Additionally, if the MAF file has at least these two columns**

            - REFERENCE_ALLELE (REF)
            - TUMOR_SEQ_ALLELE2 (TSA2)

        **Then**

            NO values in REF can match TSA2

        These rules are important because Genome Nexus (GN) uses `TSA1` to annotate data
        when it's not clear which variant to use. So, there can't be a mix of rows where
        some have `TSA1` equal to `REF` and some have `TSA1` equal to `TSA2`.

    Example: Valid Examples
        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | C                | C                 | A                 |
        | T                | T                 | C                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | C                | A                 | A                 |
        | T                | C                 | C                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- |
        | C                | A                 |
        | T                | C                 |
        ```


    Example: Invalid Examples
        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | C                | C                 | A                 |
        | C                | A                 | A                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE1 | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- | ----------------- |
        | A                | C                 | A                 |
        | T                | C                 | T                 |
        ```

        ```
        | REFERENCE_ALLELE | TUMOR_SEQ_ALLELE2 |
        | ---------------- | ----------------- |
        | C                | C                 |
        | T                | C                 |
        ```


    See this [Genome Nexus issue](https://github.com/genome-nexus/annotation-tools/issues/26) for
    more background regarding why this validation rule was implemented.

    Args:
        df: input mutation dataframe

    Returns:
        str: the error message (empty string when all checks pass)
    """
    tsa2_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE2")
    tsa1_col_exist = process_functions.checkColExist(df, "TUMOR_SEQ_ALLELE1")
    ref_col_exist = process_functions.checkColExist(df, "REFERENCE_ALLELE")
    error = ""
    if tsa2_col_exist and tsa1_col_exist and ref_col_exist:
        # TSA1 must uniformly equal either REF or TSA2 across the whole
        # file; a mix of the two conventions is ambiguous for annotation.
        tsa1_eq_ref = all(df["TUMOR_SEQ_ALLELE1"] == df["REFERENCE_ALLELE"])
        tsa1_eq_tsa2 = all(df["TUMOR_SEQ_ALLELE1"] == df["TUMOR_SEQ_ALLELE2"])
        if not (tsa1_eq_ref or tsa1_eq_tsa2):
            error = (
                "maf: Contains both "
                "TUMOR_SEQ_ALLELE1 and TUMOR_SEQ_ALLELE2 columns. "
                "All values in TUMOR_SEQ_ALLELE1 must match all values in "
                "REFERENCE_ALLELE or all values in TUMOR_SEQ_ALLELE2.\n"
            )
    if tsa2_col_exist and ref_col_exist:
        # Evaluate the query once (the original ran it twice and stored
        # the row indices in an unused local).
        ref_matches_tsa2 = df.query("REFERENCE_ALLELE == TUMOR_SEQ_ALLELE2")
        if not ref_matches_tsa2.empty:
            error = (
                f"{error}maf: Contains instances where values in REFERENCE_ALLELE match values in TUMOR_SEQ_ALLELE2. "
                "This is invalid. Please correct.\n"
            )
    return error

_check_allele_col(df, col)

Check the Allele column is correctly formatted.

PARAMETER DESCRIPTION
df

mutation dataframe

col

Column header name

RETURNS DESCRIPTION

error, warning

Source code in genie_registry/maf.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def _check_allele_col(df, col):
    """
    Check the Allele column is correctly formatted.

    Args:
        df: mutation dataframe
        col: Column header name

    Returns:
        error, warning: error message string (empty when valid) and a
            warning string (currently always empty)

    """
    col_exist = process_functions.checkColExist(df, col)
    error = ""
    warning = ""
    if col_exist:
        # CHECK: There can't be any null values.
        # .any() short-circuits at the pandas level instead of summing
        # the whole boolean mask in Python.
        if df[col].isnull().any():
            error = f"maf: {col} can't have any blank or null values.\n"

    return error, warning