Skip to content

Bed

genie_registry.bed

GENIE bed class and functions

Attributes

LOGGER = logging.getLogger(__name__) module-attribute

Classes

bed

Bases: FileTypeFormat

GENIE bed format

Source code in genie_registry/bed.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
class bed(FileTypeFormat):
    """GENIE bed format"""

    _fileType = "bed"

    _process_kwargs = ["newPath", "parentId", "databaseSynId", "seq_assay_id"]

    def _get_dataframe(self, filepathlist):
        """
        Bed files don't have a header

        Args:
            filePathList: List of files
        """
        filepath = filepathlist[0]
        try:
            beddf = pd.read_csv(filepath, sep="\t", header=None)
        except Exception:
            raise ValueError(
                "Can't read in your bed file. "
                "Please make sure the BED file is not binary and "
                "does not contain a comment/header line"
            )
        first_value = str(beddf[0][0])
        if (
            not first_value.isdigit()
            and not first_value.startswith("chr")
            and first_value not in ["X", "Y"]
        ):
            raise ValueError(
                "Please make sure your bed file does not "
                "contain a comment/header line"
            )
        return beddf

    def _validateFilename(self, filepath):
        """
        Validates filename
        CENTER-11.bed

        Args:
            filePath: Path to bedfile
        """

        assert os.path.basename(filepath[0]).startswith(
            "%s-" % self.center
        ) and os.path.basename(filepath[0]).endswith(".bed")

    def create_gene_panel(self, beddf, seq_assay_id, gene_panel_path, parentid):
        """
        Create bed file and gene panel files from the bed file

        Args:
            beddf: bed dataframe
            seq_assay_id: GENIE SEQ_ASSAY_ID
            gene_panel_path: Gene panel folder path
            parentid: Synapse id of gene panel folder

        Returns:
            pd.DataFrame: configured bed dataframe
        """
        LOGGER.info("CREATING GENE PANEL")
        if not beddf.empty:
            exonsdf = beddf[beddf["Feature_Type"] == "exon"]
            # Only include genes that should be included in the panels
            include_exonsdf = exonsdf[exonsdf["includeInPanel"]]
            # Write gene panel
            null_genes = include_exonsdf["Hugo_Symbol"].isnull()
            unique_genes = set(include_exonsdf["Hugo_Symbol"][~null_genes])
            gene_panel_text = (
                "stable_id: {seq_assay_id}\n"
                "description: {seq_assay_id}, "
                "Number of Genes - {num_genes}\n"
                "gene_list:\t{genelist}".format(
                    seq_assay_id=seq_assay_id,
                    num_genes=len(unique_genes),
                    genelist="\t".join(unique_genes),
                )
            )
            gene_panel_name = "data_gene_panel_" + seq_assay_id + ".txt"

            with open(os.path.join(gene_panel_path, gene_panel_name), "w+") as f:
                f.write(gene_panel_text)

            annotations = {
                "fileFormat": "bed",
                "dataSubType": "metadata",
                "cBioFileFormat": "genePanel",
                "center": self.center,
                "species": "Human",
                "consortium": "GENIE",
                "dataType": "genomicVariants",
                "fundingAgency": "AACR",
                "assay": "targetGeneSeq",
                "fileStage": "staging",
            }
            load.store_file(
                syn=self.syn,
                filepath=os.path.join(gene_panel_path, gene_panel_name),
                parentid=parentid,
                annotations=annotations,
            )

    def _process(self, beddf, seq_assay_id, newpath, parentid, create_panel=True):
        """
        Process bed file, add feature type

        Args:
            gene: bed dataframe
            seq_assay_id: GENIE SEQ_ASSAY_ID
            newpath: new GENIE path
            parentid: Synapse id to store the gene panel
            create_panel: Create gene panel

        Returns:
            pd.DataFrame: Conigured bed dataframe
        """
        seq_assay_id = seq_assay_id.upper()
        seq_assay_id = seq_assay_id.replace("_", "-")

        # Add in 6th column which is the clinicalReported
        if len(beddf.columns) > 5:
            if all(beddf[5].apply(lambda x: x in [True, False])):
                beddf[5] = beddf[5].astype(bool)
            else:
                beddf[5] = float("nan")
        else:
            beddf[5] = float("nan")
        beddf = beddf[[0, 1, 2, 3, 4, 5]]
        gene_panel_path = os.path.dirname(newpath)
        # Must be .astype(bool) because `1, 0 in [True, False]`
        beddf[4] = beddf[4].astype(bool)

        exon_gtf_path, gene_gtf_path = create_gtf(process_functions.SCRIPT_DIR)
        LOGGER.info("REMAPPING {}".format(seq_assay_id))
        # bedname = seq_assay_id + ".bed"
        beddf.columns = [
            "Chromosome",
            "Start_Position",
            "End_Position",
            "Hugo_Symbol",
            "includeInPanel",
            "clinicalReported",
        ]
        # Validate gene symbols
        # Gene symbols can be split by ; and _ and : and .
        beddf["Hugo_Symbol"] = [
            i.split(";")[0].split("_")[0].split(":")[0].split(".")[0]
            for i in beddf["Hugo_Symbol"]
        ]
        # Replace all chr with blank
        beddf["Chromosome"] = [str(i).replace("chr", "") for i in beddf["Chromosome"]]
        # Change all start and end to int
        beddf["Start_Position"] = beddf["Start_Position"].apply(int)
        beddf["End_Position"] = beddf["End_Position"].apply(int)

        gene_position_table = self.syn.tableQuery("SELECT * FROM syn11806563")
        gene_positiondf = gene_position_table.asDataFrame()
        beddf["ID"] = beddf["Hugo_Symbol"]
        # The apply function of a DataFrame is called twice on the first
        # row (known pandas behavior)
        beddf = beddf.apply(lambda x: remap_symbols(x, gene_positiondf), axis=1)
        beddf["SEQ_ASSAY_ID"] = seq_assay_id
        temp_bed_path = os.path.join(process_functions.SCRIPT_DIR, "temp.bed")
        beddf.to_csv(temp_bed_path, sep="\t", index=False, header=None)
        final_bed = add_feature_type(temp_bed_path, exon_gtf_path, gene_gtf_path)
        final_bed["CENTER"] = self.center
        final_bed["Chromosome"] = final_bed["Chromosome"].astype(str)
        if create_panel:
            self.create_gene_panel(final_bed, seq_assay_id, gene_panel_path, parentid)
        return final_bed

    def preprocess(self, newpath):
        """
        Standardize and grab seq assay id from the bed file path

        Args:
            newpath: bed file path

        Returns:
            dict: GENIE seq assay id
        """
        seq_assay_id = os.path.basename(newpath).replace(".bed", "")
        seq_assay_id = seq_assay_id.upper().replace("_", "-")
        return {"seq_assay_id": seq_assay_id}

    def process_steps(
        self,
        beddf: pd.DataFrame,
        newPath: str,
        parentId: str,
        databaseSynId: str,
        seq_assay_id: str,
    ) -> str:
        """Process bed file, update bed database, write bed file to path

        Args:
            beddf (pd.DataFrame): input bed data
            newPath (str): Path to new bed file
            parentId (str): Synapse id to store gene panel file
            databaseSynId (str): Synapse id of bed database
            seq_assay_id (str): GENIE seq assay id

        Returns:
            str: Path to new bed file
        """
        final_beddf = self._process(
            beddf=beddf, seq_assay_id=seq_assay_id, newpath=newPath, parentid=parentId
        )
        load.update_table(
            syn=self.syn,
            databaseSynId=databaseSynId,
            newData=final_beddf,
            filterBy=seq_assay_id,
            filterByColumn="SEQ_ASSAY_ID",
            toDelete=True,
        )
        final_beddf.to_csv(newPath, sep="\t", index=False)
        return newPath

    def _validate(self, beddf):
        """
        Validate bed file

        Args:
            bed: Bed dataframe

        Returns:
            total_error: all the errors
            warning: all the warnings
        """
        total_error = ""
        warning = ""
        newcols = [
            "Chromosome",
            "Start_Position",
            "End_Position",
            "Hugo_Symbol",
            "includeInPanel",
        ]
        if len(beddf.columns) < len(newcols):
            total_error += (
                "BED file: Must at least have five columns in this "
                "order: {}. Make sure there are "
                "no headers.\n".format(", ".join(newcols))
            )
        else:
            newcols.extend(range(0, len(beddf.columns) - len(newcols)))
            beddf.columns = newcols
            to_validate_symbol = True
            if not all(beddf["Start_Position"].apply(lambda x: isinstance(x, int))):
                total_error += (
                    "BED file: "
                    "The Start_Position column must only be "
                    "integers. Make sure there are no headers.\n"
                )
                to_validate_symbol = False
            if not all(beddf["End_Position"].apply(lambda x: isinstance(x, int))):
                total_error += (
                    "BED file: "
                    "The End_Position column must only be "
                    "integers. Make sure there are no headers.\n"
                )
                to_validate_symbol = False

            LOGGER.info("VALIDATING GENE SYMBOLS")
            if any(beddf["Hugo_Symbol"].isnull()):
                total_error += "BED file: You cannot submit any null symbols.\n"
            beddf = beddf[~beddf["Hugo_Symbol"].isnull()]
            beddf["Hugo_Symbol"] = [
                str(hugo).split(";")[0].split("_")[0].split(":")[0]
                for hugo in beddf["Hugo_Symbol"]
            ]
            if (
                sum(beddf["Hugo_Symbol"] == "+") != 0
                or sum(beddf["Hugo_Symbol"] == "-") != 0
            ):
                total_error += (
                    "BED file: Fourth column must be the "
                    "Hugo_Symbol column, not the strand column\n"
                )

            warn, error = process_functions.check_col_and_values(
                beddf,
                "includeInPanel",
                [True, False],
                filename="BED file",
                required=True,
            )
            warning += warn
            total_error += error

            if to_validate_symbol:
                gene_position_table = self.syn.tableQuery("SELECT * FROM syn11806563")
                gene_positiondf = gene_position_table.asDataFrame()
                # The apply function of a DataFrame is called twice on the first row (known
                # pandas behavior)
                beddf = beddf.apply(lambda x: remap_symbols(x, gene_positiondf), axis=1)

                if any(beddf["Hugo_Symbol"].isnull()):
                    warning += (
                        "BED file: "
                        "Any gene names that can't be "
                        "remapped will be null.\n"
                    )
                if all(beddf["Hugo_Symbol"].isnull()):
                    total_error += (
                        "BED file: "
                        "You have no correct gene symbols. "
                        "Make sure your gene symbol column (4th "
                        "column) is formatted like so: SYMBOL"
                        "(;optionaltext).  Optional text can be "
                        "semi-colon separated.\n"
                    )
            # validate on required chromosome col, chr is allowed
            # as we tackle it later in processing
            error, warn = validate._validate_chromosome(
                df=beddf, col="Chromosome", fileformat="BED file", allow_chr=True
            )
            total_error += error
            warning += warn

        return (total_error, warning)
Functions
create_gene_panel(beddf, seq_assay_id, gene_panel_path, parentid)

Create bed file and gene panel files from the bed file

PARAMETER DESCRIPTION
beddf

bed dataframe

seq_assay_id

GENIE SEQ_ASSAY_ID

gene_panel_path

Gene panel folder path

parentid

Synapse id of gene panel folder

RETURNS DESCRIPTION

pd.DataFrame: configured bed dataframe

Source code in genie_registry/bed.py
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
def create_gene_panel(self, beddf, seq_assay_id, gene_panel_path, parentid):
    """
    Create bed file and gene panel files from the bed file

    Args:
        beddf: bed dataframe
        seq_assay_id: GENIE SEQ_ASSAY_ID
        gene_panel_path: Gene panel folder path
        parentid: Synapse id of gene panel folder

    Returns:
        pd.DataFrame: configured bed dataframe
    """
    LOGGER.info("CREATING GENE PANEL")
    if not beddf.empty:
        exonsdf = beddf[beddf["Feature_Type"] == "exon"]
        # Only include genes that should be included in the panels
        include_exonsdf = exonsdf[exonsdf["includeInPanel"]]
        # Write gene panel
        null_genes = include_exonsdf["Hugo_Symbol"].isnull()
        unique_genes = set(include_exonsdf["Hugo_Symbol"][~null_genes])
        gene_panel_text = (
            "stable_id: {seq_assay_id}\n"
            "description: {seq_assay_id}, "
            "Number of Genes - {num_genes}\n"
            "gene_list:\t{genelist}".format(
                seq_assay_id=seq_assay_id,
                num_genes=len(unique_genes),
                genelist="\t".join(unique_genes),
            )
        )
        gene_panel_name = "data_gene_panel_" + seq_assay_id + ".txt"

        with open(os.path.join(gene_panel_path, gene_panel_name), "w+") as f:
            f.write(gene_panel_text)

        annotations = {
            "fileFormat": "bed",
            "dataSubType": "metadata",
            "cBioFileFormat": "genePanel",
            "center": self.center,
            "species": "Human",
            "consortium": "GENIE",
            "dataType": "genomicVariants",
            "fundingAgency": "AACR",
            "assay": "targetGeneSeq",
            "fileStage": "staging",
        }
        load.store_file(
            syn=self.syn,
            filepath=os.path.join(gene_panel_path, gene_panel_name),
            parentid=parentid,
            annotations=annotations,
        )
preprocess(newpath)

Standardize and grab seq assay id from the bed file path

PARAMETER DESCRIPTION
newpath

bed file path

RETURNS DESCRIPTION
dict

GENIE seq assay id

Source code in genie_registry/bed.py
568
569
570
571
572
573
574
575
576
577
578
579
580
def preprocess(self, newpath):
    """
    Standardize and grab seq assay id from the bed file path

    Args:
        newpath: bed file path

    Returns:
        dict: GENIE seq assay id
    """
    seq_assay_id = os.path.basename(newpath).replace(".bed", "")
    seq_assay_id = seq_assay_id.upper().replace("_", "-")
    return {"seq_assay_id": seq_assay_id}
process_steps(beddf, newPath, parentId, databaseSynId, seq_assay_id)

Process bed file, update bed database, write bed file to path

PARAMETER DESCRIPTION
beddf

input bed data

TYPE: DataFrame

newPath

Path to new bed file

TYPE: str

parentId

Synapse id to store gene panel file

TYPE: str

databaseSynId

Synapse id of bed database

TYPE: str

seq_assay_id

GENIE seq assay id

TYPE: str

RETURNS DESCRIPTION
str

Path to new bed file

TYPE: str

Source code in genie_registry/bed.py
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
def process_steps(
    self,
    beddf: pd.DataFrame,
    newPath: str,
    parentId: str,
    databaseSynId: str,
    seq_assay_id: str,
) -> str:
    """Process bed file, update bed database, write bed file to path

    Args:
        beddf (pd.DataFrame): input bed data
        newPath (str): Path to new bed file
        parentId (str): Synapse id to store gene panel file
        databaseSynId (str): Synapse id of bed database
        seq_assay_id (str): GENIE seq assay id

    Returns:
        str: Path to new bed file
    """
    final_beddf = self._process(
        beddf=beddf, seq_assay_id=seq_assay_id, newpath=newPath, parentid=parentId
    )
    load.update_table(
        syn=self.syn,
        databaseSynId=databaseSynId,
        newData=final_beddf,
        filterBy=seq_assay_id,
        filterByColumn="SEQ_ASSAY_ID",
        toDelete=True,
    )
    final_beddf.to_csv(newPath, sep="\t", index=False)
    return newPath

Functions

create_gtf(dirname)

Create exon.gtf and gene.gtf from GRCh37 gtf

PARAMETER DESCRIPTION
dirname

Directory where these files should live

RETURNS DESCRIPTION
exon_gtf_path

exon GTF

gene_gtf_path

gene GTF

Source code in genie_registry/bed.py
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def create_gtf(dirname):
    """
    Create exon.gtf and gene.gtf from GRCh37 gtf

    Args:
        dirname: Directory where these files should live

    Returns:
        exon_gtf_path: exon GTF
        gene_gtf_path: gene GTF
    """
    exon_gtf_path = os.path.join(dirname, "exon.gtf")
    gene_gtf_path = os.path.join(dirname, "gene.gtf")

    if not os.path.exists(exon_gtf_path) or not os.path.exists(gene_gtf_path):
        download_cmd = [
            "wget",
            "http://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz",
            "-P",
            dirname,
        ]
        subprocess.check_call(download_cmd)
        gtfgz_path = os.path.join(dirname, "Homo_sapiens.GRCh37.75.gtf.gz")
        gunzip_cmd = ["gunzip", "-f", gtfgz_path]
        subprocess.check_call(gunzip_cmd)
        gtf_path = os.path.join(dirname, "Homo_sapiens.GRCh37.75.gtf")

        exon_awk_cmd = ["awk", '$3 == "exon" {print}', gtf_path]
        exon_gtf = subprocess.check_output(exon_awk_cmd, universal_newlines=True)
        with open(exon_gtf_path, "w") as gtf_file:
            gtf_file.write(exon_gtf)
        gene_awk_cmd = ["awk", '$3 == "gene" {print}', gtf_path]
        gene_gtf = subprocess.check_output(gene_awk_cmd, universal_newlines=True)
        with open(gene_gtf_path, "w") as gtf_file:
            gtf_file.write(gene_gtf)
    return (exon_gtf_path, gene_gtf_path)

_add_feature_type_tobeddf(filepath, featuretype)

Add Feature_Type to dataframe

PARAMETER DESCRIPTION
filepath

path to bed

featuretype

exon, intron, or intergenic

RETURNS DESCRIPTION
df

empty dataframe or dataframe with appended feature type

Source code in genie_registry/bed.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def _add_feature_type_tobeddf(filepath, featuretype):
    """
    Add Feature_Type to dataframe

    Args:
        filepath: path to bed
        featuretype: exon, intron, or intergenic

    Returns:
        df: empty dataframe or dataframe with appended feature type
    """
    bed_columns = [
        "Chromosome",
        "Start_Position",
        "End_Position",
        "Hugo_Symbol",
        "includeInPanel",
        "clinicalReported",
        "ID",
        "SEQ_ASSAY_ID",
        "Feature_Type",
    ]
    # No need to add anything if the dataframe is empty
    if os.stat(filepath).st_size != 0:
        beddf = pd.read_csv(filepath, sep="\t", header=None)
        beddf["Feature_Type"] = featuretype
        beddf.columns = bed_columns
    else:
        beddf = pd.DataFrame(columns=bed_columns)
    return beddf

add_feature_type(temp_bed_path, exon_gtf_path, gene_gtf_path)

Add Feature_Type to bed file (exon, intron, intergenic)

PARAMETER DESCRIPTION
temp_bed_path

BED file without feature type

exon_gtf_path

exon gtf

gene_gtf_path

gene gtf

RETURNS DESCRIPTION
genie_combined_path

Path to final bed file

Source code in genie_registry/bed.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
def add_feature_type(temp_bed_path, exon_gtf_path, gene_gtf_path):
    """
    Add Feature_Type to bed file (exon, intron, intergenic)

    Args:
        temp_bed_path: BED file without feature type
        exon_gtf_path: exon gtf
        gene_gtf_path: gene gtf

    Returns:
        genie_combined_path: Path to final bed file
    """
    genie_exon_path = os.path.join(process_functions.SCRIPT_DIR, "genie_exons.bed")
    genie_intron_path = os.path.join(process_functions.SCRIPT_DIR, "genie_introns.bed")
    genie_intergenic_path = os.path.join(
        process_functions.SCRIPT_DIR, "genie_intergenic.bed"
    )
    intron_intergenic_path = os.path.join(
        process_functions.SCRIPT_DIR, "intron_intergenic.bed"
    )
    gene_path = os.path.join(process_functions.SCRIPT_DIR, "gene.bed")
    # GET EXON REGIONS
    # Get intersection between true exon regions and submitted bed regions
    command = [
        "bedtools",
        "intersect",
        "-a",
        temp_bed_path,
        "-b",
        exon_gtf_path,
        "-wa",
        "|",
        "sort",
        "|",
        "uniq",
        ">",
        genie_exon_path,
    ]
    subprocess.check_call(" ".join(command), shell=True)
    # get intergenic/intron regions
    # Get opposite of intersection between true exon regions and submitted
    # bed regions
    command = [
        "bedtools",
        "intersect",
        "-a",
        temp_bed_path,
        "-b",
        exon_gtf_path,
        "-wa",
        "-v",
        "|",
        "sort",
        "|",
        "uniq",
        ">",
        intron_intergenic_path,
    ]
    subprocess.check_call(" ".join(command), shell=True)
    # get gene regions
    # Get intersection between true gene regions and submitted bed regions
    command = [
        "bedtools",
        "intersect",
        "-a",
        temp_bed_path,
        "-b",
        gene_gtf_path,
        "-wa",
        "|",
        "sort",
        "|",
        "uniq",
        ">",
        gene_path,
    ]
    subprocess.check_call(" ".join(command), shell=True)
    # GET INTRON REGSIONS
    # Difference between the gene regions and exon regions will give
    # intron regions
    command = [
        "diff",
        gene_path,
        genie_exon_path,
        "|",
        "grep",
        "'<'",
        "|",
        "sed",
        "'s/< //'",
        ">",
        genie_intron_path,
    ]
    subprocess.check_call(" ".join(command), shell=True)
    # GET INTERGENIC REGIONS
    # Difference between the intron/intergenic and intron regions will
    # give intergenic regions
    command = [
        "diff",
        intron_intergenic_path,
        genie_intron_path,
        "|",
        "grep",
        "'<'",
        "|",
        "sed",
        "'s/< //'",
        ">",
        genie_intergenic_path,
    ]
    subprocess.check_call(" ".join(command), shell=True)

    genie_exondf = _add_feature_type_tobeddf(genie_exon_path, "exon")
    genie_introndf = _add_feature_type_tobeddf(genie_intron_path, "intron")
    genie_intergenicdf = _add_feature_type_tobeddf(genie_intergenic_path, "intergenic")
    genie_combineddf = pd.concat([genie_exondf, genie_introndf, genie_intergenicdf])
    return genie_combineddf

_check_region_overlap(row, gene_positiondf)

Check if the submitted bed symbol + region overlaps with the actual gene's positions

PARAMETER DESCRIPTION
row

row in bed file (genomic region)

gene_positiondf

Reference gene position dataframe

Return

True if the region does overlap.

Source code in genie_registry/bed.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
def _check_region_overlap(row, gene_positiondf):
    """
    Check if the submitted bed symbol + region overlaps with
    the actual gene's positions

    Args:
        row: row in bed file (genomic region)
        gene_positiondf: Reference gene position dataframe

    Return:
        True if the region does overlap.
    """
    matching_symbol_ind = gene_positiondf["hgnc_symbol"] == row["Hugo_Symbol"]
    match_with_genedb = gene_positiondf[matching_symbol_ind]

    if not match_with_genedb.empty:
        # We are assuming that there is only one matching gene, but
        # there are actually duplicated gene symbols
        start_position = match_with_genedb["start_position"].values[0]
        end_position = match_with_genedb["end_position"].values[0]
        # Submitted bed start position in a gene
        start_in_gene = start_position <= row["Start_Position"] <= end_position
        # Submitted bed end position in a gene
        end_in_gene = start_position <= row["End_Position"] <= end_position
        # Submitted bed region surrounds a gene
        gene_in_region = (
            end_position <= row["End_Position"]
            and start_position >= row["Start_Position"]
        )
        return start_in_gene or end_in_gene or gene_in_region
    return False

_get_max_overlap_index(overlap, bed_length, boundary)

Calculate the ratio of overlap between the submitted bed region and gene position dataframe and return the index of the max overlapping region

PARAMETER DESCRIPTION
overlap

Possible overlapping region

bed_length

Length of submitted region

boundary

specified ratio overlap

RETURNS DESCRIPTION

Index of regions with maximum overlap or None

Source code in genie_registry/bed.py
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
def _get_max_overlap_index(overlap, bed_length, boundary):
    """
    Calculate the ratio of overlap between the submitted bed
    region and gene position dataframe and return the index of
    the max overlapping region

    Args:
        overlap: Possible overlapping region
        bed_length: Length of submitted region
        boundary: specified ratio overlap

    Returns:
        Index of regions with maximum overlap or None
    """
    ratio_overlap = overlap / bed_length
    ratio_overlap = ratio_overlap[ratio_overlap > boundary]
    ratio_overlap = ratio_overlap[ratio_overlap <= 1]
    if not ratio_overlap.empty:
        return ratio_overlap.idxmax()
    return None

_map_position_within_boundary(row, positiondf, boundary=0.9)

Map positions and checks if posision is contained within the specified percentage boundary

PARAMETER DESCRIPTION
row

Row in bed file (genomic region)

positiondf

Reference bed position dataframe

boundary

Percent boundary defined

DEFAULT: 0.9

Return

pd.Series: mapped position

Source code in genie_registry/bed.py
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
def _map_position_within_boundary(row, positiondf, boundary=0.9):
    """
    Map positions and checks if posision is contained within the
    specified percentage boundary

    Args:
        row: Row in bed file (genomic region)
        positiondf: Reference bed position dataframe
        boundary: Percent boundary defined

    Return:
        pd.Series: mapped position
    """
    # First filter just based on whether or not region is within the gene
    # position dataframe
    match_chr = positiondf["chromosome_name"] == str(row["Chromosome"])
    chrom_rows = positiondf[match_chr]
    match_start = chrom_rows["start_position"] <= row["Start_Position"]
    start_rows = chrom_rows[match_start]
    end_rows = start_rows[start_rows["end_position"] >= row["End_Position"]]

    bed_length = row["End_Position"] - row["Start_Position"]
    # as long as the strand is within the boundary % defined
    # Start goes over start boundary, but end is contained in position
    if end_rows.empty:
        if sum(chrom_rows["end_position"] >= row["End_Position"]) > 0:
            overlap = row["End_Position"] - chrom_rows["start_position"]
            # difference =  difference * -1.0
            max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
            if max_overlap is not None:
                end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
        # End goes over end boundary, but start is contained in position
        if sum(chrom_rows["start_position"] <= row["Start_Position"]) > 0:
            overlap = chrom_rows["end_position"] - row["Start_Position"]
            max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
            if max_overlap is not None:
                end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
        # Start and end go over position boundary
        check = chrom_rows[chrom_rows["start_position"] >= row["Start_Position"]]
        check = check[check["end_position"] <= row["End_Position"]]
        if not check.empty:
            overlap = chrom_rows["end_position"] - chrom_rows["start_position"]
            max_overlap = _get_max_overlap_index(overlap, bed_length, boundary)
            if max_overlap is not None:
                end_rows = pd.concat([end_rows, chrom_rows.loc[[max_overlap]]])
    return end_rows

remap_symbols(row, gene_positiondf)

Remap hugo symbols if there is no overlap between submitted bed region and gene positions.

PARAMETER DESCRIPTION
row

start and end position

gene_positiondf

Actual gene position dataframe

Return

bool or Series: if the gene passed in need to be remapped or the remapped gene

Source code in genie_registry/bed.py
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
def remap_symbols(row, gene_positiondf):
    """
    Remap hugo symbols if there is no overlap between submitted bed region and
    gene positions.

    Args:
        row: start and end position
        gene_positiondf: Actual gene position dataframe

    Return:
        bool or Series: if the gene passed in need to be remapped or
                        the remapped gene
    """
    region_overlap = _check_region_overlap(row, gene_positiondf)
    if not region_overlap:
        overlap_positions = _map_position_within_boundary(row, gene_positiondf)
        if overlap_positions.empty:
            LOGGER.warning(
                "{} cannot be remapped. "
                "These rows will have an empty gene symbol".format(row["Hugo_Symbol"])
            )
            row["Hugo_Symbol"] = float("nan")
        elif len(overlap_positions) > 1:
            symbol_list = overlap_positions["hgnc_symbol"].tolist()
            if row["Hugo_Symbol"] not in symbol_list:
                # if "MLL4", then the HUGO symbol should be KMT2D and KMT2B
                LOGGER.warning(
                    "{} can be mapped to different symbols: {}. "
                    "Please correct or it will be removed.".format(
                        row["Hugo_Symbol"], ", ".join(symbol_list)
                    )
                )
                row["Hugo_Symbol"] = float("nan")
        else:
            symbol = overlap_positions["hgnc_symbol"].values[0]
            if row["Hugo_Symbol"] != symbol:
                LOGGER.info(
                    "{} will be remapped to {}".format(row["Hugo_Symbol"], symbol)
                )
                row["Hugo_Symbol"] = symbol
    return row