Skip to content

create_case_lists

genie.create_case_lists

Creates case lists per cancer type

Attributes

CASE_LIST_TEXT_TEMPLATE = 'cancer_study_identifier: {study_id}\nstable_id: {stable_id}\ncase_list_name: {case_list_name}\ncase_list_description: {case_list_description}\ncase_list_ids: {case_list_ids}' module-attribute

Functions

create_case_lists_map(clinical_file_name)

Creates the case list dictionary

PARAMETER DESCRIPTION
clinical_file_name

clinical file path

RETURNS DESCRIPTION
dict

key = cancer_type value = list of sample ids

dict

key = seq_assay_id value = list of sample ids

list

Clinical samples

Source code in genie/create_case_lists.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def create_case_lists_map(clinical_file_name):
    """
    Creates the case list dictionary

    Reads the tab-delimited clinical file once and groups the SAMPLE_ID
    column by the CANCER_TYPE column and by the SEQ_ASSAY_ID column.

    Args:
        clinical_file_name: clinical file path

    Returns:
        dict: key = cancer_type
              value = list of sample ids
        dict: key = seq_assay_id
              value = list of sample ids
        list: Clinical samples
    """
    seq_assay_map = defaultdict(list)
    clinical_file_map = defaultdict(list)
    clin_samples = []
    # The "U" mode flag was removed in Python 3.11 (open() raises ValueError).
    # newline="" lets the csv module do its own newline handling, which is
    # what the csv documentation asks for.
    with open(clinical_file_name, "r", newline="") as clinical_file:
        reader = csv.DictReader(clinical_file, dialect="excel-tab")
        for row in reader:
            sample_id = row["SAMPLE_ID"]
            clinical_file_map[row["CANCER_TYPE"]].append(sample_id)
            seq_assay_map[row["SEQ_ASSAY_ID"]].append(sample_id)
            clin_samples.append(sample_id)
    return clinical_file_map, seq_assay_map, clin_samples

_write_single_oncotree_case_list(cancer_type, ids, study_id, output_directory)

Writes one oncotree case list. Python versions below 3.6 will sort the dictionary keys, which causes tests to fail

PARAMETER DESCRIPTION
cancer_type

Oncotree code cancer type

ids

GENIE sample ids

study_id

cBioPortal study id

output_directory

case list output directory

RETURNS DESCRIPTION

case list file path

Source code in genie/create_case_lists.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def _write_single_oncotree_case_list(cancer_type, ids, study_id, output_directory):
    """
    Writes the case list file for a single cancer type.

    An empty cancer type is labelled "NA". The token used in the file name
    and stable id is the cancer type with spaces and slashes turned into
    underscores and commas dropped; a token of "NA" becomes
    "no_oncotree_code".

    Note kept from the original author: Python versions below 3.6 will sort
    the dictionary keys, which causes tests to fail.

    Args:
        cancer_type: Oncotree code cancer type
        ids: GENIE sample ids
        study_id: cBioPortal study id
        output_directory: case list output directory

    Returns:
        case list file path
    """
    if cancer_type == "":
        cancer_type = "NA"

    # Build the file-system / stable-id friendly token.
    file_token = cancer_type
    for old, new in ((" ", "_"), (",", ""), ("/", "_")):
        file_token = file_token.replace(old, new)
    if file_token == "NA":
        file_token = "no_oncotree_code"

    template_fields = {
        "study_id": study_id,
        "stable_id": study_id + "_" + file_token,
        "case_list_name": "Tumor Type: " + cancer_type,
        "case_list_description": "All tumors with cancer type " + cancer_type,
        "case_list_ids": "\t".join(ids),
    }
    case_list_text = CASE_LIST_TEXT_TEMPLATE.format(**template_fields)

    file_name = "cases_" + file_token + ".txt"
    case_list_path = os.path.abspath(os.path.join(output_directory, file_name))
    with open(case_list_path, "w") as out_file:
        out_file.write(case_list_text)
    return case_list_path

write_case_list_files(clinical_file_map, output_directory, study_id)

Writes the cancer_type case list file to case_lists directory

PARAMETER DESCRIPTION
clinical_file_map

cancer type to sample id mapping from create_case_lists_map

output_directory

Directory to write case lists

study_id

cBioPortal study id

RETURNS DESCRIPTION
list

oncotree code case list files

Source code in genie/create_case_lists.py
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def write_case_list_files(clinical_file_map, output_directory, study_id):
    """
    Writes one cancer_type case list file per entry of the mapping.

    Args:
        clinical_file_map: cancer type to sample id mapping from
                           create_case_lists_map
        output_directory: Directory to write case lists
        study_id: cBioPortal study id

    Returns:
        list: oncotree code case list files, in the mapping's iteration order
    """
    return [
        _write_single_oncotree_case_list(
            cancer_type, sample_ids, study_id, output_directory
        )
        for cancer_type, sample_ids in clinical_file_map.items()
    ]

create_sequenced_samples(seq_assay_map, assay_info_file_name)

Gets samples sequenced

PARAMETER DESCRIPTION
seq_assay_map

dictionary containing lists of samples per seq_assay_id

assay_info_file_name

Assay information name

RETURNS DESCRIPTION

lists of cna, fusion and sv samples

Source code in genie/create_case_lists.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def create_sequenced_samples(seq_assay_map, assay_info_file_name):
    """
    Gets samples sequenced

    Reads the tab-delimited assay information file and, for every row,
    collects the samples of that row's SEQ_ASSAY_ID according to the text
    in its alteration_types column.

    Args:
        seq_assay_map: dictionary containing lists of samples per seq_assay_id
        assay_info_file_name: Assay information name

    Returns:
        list: samples of assays whose alteration_types contains "cna"
        list: fusion samples; same contents as the sv list
        list: samples of assays whose alteration_types contains
              "structural_variants"
    """
    cna_samples = []
    # TODO: Remove when deprecating fusion files
    fusion_samples = []
    sv_samples = []
    with open(assay_info_file_name, "r") as assay_info_file:
        reader = csv.DictReader(assay_info_file, dialect="excel-tab")
        for row in reader:
            alteration_types = row["alteration_types"]
            has_cna = "cna" in alteration_types
            has_sv = "structural_variants" in alteration_types
            if not (has_cna or has_sv):
                continue
            # .get() instead of indexing: an assay with no clinical samples
            # must not insert an empty entry into the caller's defaultdict,
            # and must not raise KeyError when a plain dict is passed.
            assay_samples = seq_assay_map.get(row["SEQ_ASSAY_ID"], [])
            if has_cna:
                cna_samples.extend(assay_samples)
            if has_sv:
                fusion_samples.extend(assay_samples)
                sv_samples.extend(assay_samples)
    return cna_samples, fusion_samples, sv_samples

write_case_list_sequenced(clinical_samples, output_directory, study_id)

Writes the genie sequenced and all samples. Since all samples are sequenced, _all and _sequenced are equal

PARAMETER DESCRIPTION
clinical_samples

List of clinical samples

output_directory

Directory to write case lists

study_id

cBioPortal study id

RETURNS DESCRIPTION
list

case list sequenced and all

Source code in genie/create_case_lists.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def write_case_list_sequenced(clinical_samples, output_directory, study_id):
    """
    Writes the "sequenced" and "all" case lists. Both files receive the
    same sample ids, since all samples are sequenced.

    Args:
        clinical_samples: List of clinical samples
        output_directory: Directory to write case lists
        study_id: cBioPortal study id

    Returns:
        list: paths of cases_sequenced.txt and cases_all.txt, in that order
    """
    joined_ids = "\t".join(clinical_samples)
    # (file name, stable id suffix, case list name, description), in the
    # order the files are written.
    case_list_specs = (
        (
            "cases_sequenced.txt",
            "_sequenced",
            "Sequenced Tumors",
            "All sequenced samples",
        ),
        ("cases_all.txt", "_all", "All samples", "All samples"),
    )
    written_paths = []
    for file_name, id_suffix, list_name, description in case_list_specs:
        target_path = os.path.abspath(os.path.join(output_directory, file_name))
        with open(target_path, "w") as out_file:
            out_file.write(
                CASE_LIST_TEXT_TEMPLATE.format(
                    study_id=study_id,
                    stable_id=study_id + id_suffix,
                    case_list_name=list_name,
                    case_list_description=description,
                    case_list_ids=joined_ids,
                )
            )
        written_paths.append(target_path)
    return written_paths

write_case_list_cna(cna_samples, output_directory, study_id)

Writes the cna sequenced samples

PARAMETER DESCRIPTION
cna_samples

List of cna samples

output_directory

Directory to write case lists

study_id

cBioPortal study id

RETURNS DESCRIPTION

cna caselist path

Source code in genie/create_case_lists.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def write_case_list_cna(cna_samples, output_directory, study_id):
    """
    Writes the CNA case list to cases_cna.txt in the output directory.

    Args:
        cna_samples: List of cna samples
        output_directory: Directory to write case lists
        study_id: cBioPortal study id

    Returns:
        cna caselist path
    """
    joined_ids = "\t".join(cna_samples)
    relative_path = os.path.join(output_directory, "cases_cna.txt")
    cna_caselist_path = os.path.abspath(relative_path)
    with open(cna_caselist_path, "w") as out_file:
        template_fields = {
            "study_id": study_id,
            "stable_id": study_id + "_cna",
            "case_list_name": "Samples with CNA",
            "case_list_description": "Samples with CNA",
            "case_list_ids": joined_ids,
        }
        out_file.write(CASE_LIST_TEXT_TEMPLATE.format(**template_fields))
    return cna_caselist_path

write_case_list_sv(samples, output_directory, study_id)

Writes the structural variant (sv) sequenced samples

PARAMETER DESCRIPTION
samples

List of sv samples

TYPE: list

output_directory

Directory to write case lists

TYPE: str

study_id

cBioPortal study id

TYPE: str

RETURNS DESCRIPTION
str

sv caselist path

TYPE: str

Source code in genie/create_case_lists.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
def write_case_list_sv(samples: list, output_directory: str, study_id: str) -> str:
    """Writes the structural variant (sv) case list to cases_sv.txt

    Args:
        samples (list): List of sv samples
        output_directory (str): Directory to write case lists
        study_id (str): cBioPortal study id

    Returns:
        str: sv caselist path
    """
    label = "Samples with Structural Variants"
    joined_ids = "\t".join(samples)
    sv_path = os.path.abspath(os.path.join(output_directory, "cases_sv.txt"))
    with open(sv_path, "w") as out_file:
        out_file.write(
            CASE_LIST_TEXT_TEMPLATE.format(
                study_id=study_id,
                stable_id=study_id + "_sv",
                case_list_name=label,
                case_list_description=label,
                case_list_ids=joined_ids,
            )
        )
    return sv_path

write_case_list_cnaseq(cna_samples, output_directory, study_id)

writes both cna and mutation samples (Just _cna file for now)

PARAMETER DESCRIPTION
cna_samples

List of cna samples

output_directory

Directory to write case lists

study_id

cBioPortal study id

RETURNS DESCRIPTION

cnaseq path

Source code in genie/create_case_lists.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def write_case_list_cnaseq(cna_samples, output_directory, study_id):
    """
    Writes the "CNA and mutation" case list to cases_cnaseq.txt. The ids
    written are exactly the cna samples passed in (just the _cna samples
    for now).

    Args:
        cna_samples: List of cna samples
        output_directory: Directory to write case lists
        study_id: cBioPortal study id

    Returns:
        cnaseq path
    """
    label = "Samples with CNA and mutation"
    joined_ids = "\t".join(cna_samples)
    relative_path = os.path.join(output_directory, "cases_cnaseq.txt")
    cnaseq_caselist_path = os.path.abspath(relative_path)
    with open(cnaseq_caselist_path, "w") as out_file:
        rendered = CASE_LIST_TEXT_TEMPLATE.format(
            study_id=study_id,
            stable_id=study_id + "_cnaseq",
            case_list_name=label,
            case_list_description=label,
            case_list_ids=joined_ids,
        )
        out_file.write(rendered)
    return cnaseq_caselist_path

main(clinical_file_name, assay_info_file_name, output_directory, study_id)

Gets the clinical file and the assay information file and processes them to obtain case list files

PARAMETER DESCRIPTION
clinical_file_name

Clinical file path

assay_info_file_name

Assay information name

output_directory

Output directory of case list files

study_id

cBioPortal study id

Source code in genie/create_case_lists.py
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def main(clinical_file_name, assay_info_file_name, output_directory, study_id):
    """Builds the case list files from the clinical file and the assay
    information file

    Writes, in order: the per cancer type case lists, the sequenced/all
    case lists, the cna case list, the cnaseq case list and the sv case list.

    Args:
        clinical_file_name: Clinical file path
        assay_info_file_name: Assay information name
        output_directory: Output directory of case list files
        study_id: cBioPortal study id
    """
    grouped = create_case_lists_map(clinical_file_name)
    cancer_type_map, assay_map, all_samples = grouped
    write_case_list_files(cancer_type_map, output_directory, study_id)

    # create_sequenced_samples used to get the samples, but since the removal
    # of WES samples, we can no longer rely on the gene matrix file to grab
    # all sequenced samples, must use assay information file
    sequenced = create_sequenced_samples(assay_map, assay_info_file_name)
    # The fusion list is returned but not written by this function.
    cna_only, _fusion_only, sv_only = sequenced

    write_case_list_sequenced(all_samples, output_directory, study_id)
    write_case_list_cna(cna_only, output_directory, study_id)
    write_case_list_cnaseq(cna_only, output_directory, study_id)
    write_case_list_sv(sv_only, output_directory, study_id)