Skip to content

transform

genie.transform

This module contains all the transformation functions used throughout the GENIE package

Functions

_col_name_to_titlecase(string)

Convert strings to titlecase. Supports strings separated by _.

PARAMETER DESCRIPTION
string

A string

TYPE: str

RETURNS DESCRIPTION
str

A string converted to title case

TYPE: str

Source code in genie/transform.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def _col_name_to_titlecase(string: str) -> str:
    """Convert strings to titlecase. Supports strings separated by _.

    Args:
        string (str): A string

    Returns:
        str: A string converted to title case

    """
    # This is a mapping of strings that are abbreviations
    abbrev_map = {"Dna_": "DNA_", "Rna_": "RNA_", "Sv_": "SV_", "Ncbi_": "NCBI_"}
    # The reason I split the string by _ and utilize the capitalize function
    # instead of using something like .title(), so outlined here:
    # https://stackoverflow.com/questions/1549641/how-can-i-capitalize-the-first-letter-of-each-word-in-a-string
    converted_str = "_".join([each.capitalize() for each in string.split("_")])
    for titlecase, abbrev in abbrev_map.items():
        converted_str = converted_str.replace(titlecase, abbrev)
    return converted_str

_convert_col_with_nas_to_str(df, col)

This converts a column into str while preserving NAs

Source code in genie/transform.py
32
33
34
35
def _convert_col_with_nas_to_str(df: pd.DataFrame, col: str) -> list:
    """This converts a column into str while preserving NAs"""
    new_vals = [str(val) if pd.notna(val) else val for val in df[col]]
    return new_vals

_convert_float_col_with_nas_to_int(df, col)

This converts int column that was turned into a float col because pandas does that with int values that have NAs back into an int col with NAs intact

Source code in genie/transform.py
38
39
40
41
42
43
44
45
46
def _convert_float_col_with_nas_to_int(df: pd.DataFrame, col: str) -> list:
    """This converts int column that was turned into a float col because
    pandas does that with int values that have NAs back into an int col
    with NAs intact"""
    if is_float_dtype(df[col]) and df[col].isnull().values.any():
        new_vals = df[col].astype(pd.Int64Dtype()).tolist()
        return new_vals
    else:
        return df[col].tolist()

_convert_df_with_mixed_dtypes(read_csv_params)

This checks if a dataframe read in normally comes out with mixed data types (which happens when low_memory = True because read_csv parses in chunks and guesses dtypes by chunk) and converts a dataframe with mixed datatypes to one datatype.

PARAMETER DESCRIPTION
read_csv_params

of input params and values to pandas's read_csv function. needs to include filepath to dataset to be read in

TYPE: dict

RETURNS DESCRIPTION
DataFrame

pd.DataFrame : The dataset read in

Source code in genie/transform.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def _convert_df_with_mixed_dtypes(read_csv_params: dict) -> pd.DataFrame:
    """This checks if a dataframe read in normally comes out with mixed data types (which happens
    when low_memory = True because read_csv parses in chunks and guesses dtypes by chunk) and
    converts a dataframe with mixed datatypes to one datatype.

    Args:
        read_csv_params (dict): of input params and values to pandas's read_csv function.
            needs to include filepath to dataset to be read in

    Returns:
        pd.DataFrame : The dataset read in
    """
    warnings.simplefilter("error", pd.errors.DtypeWarning)
    try:
        df = pd.read_csv(**read_csv_params, low_memory=True)
    except pd.errors.DtypeWarning:
        # setting engine to c as that is the only engine that works with low_memory=False
        df = pd.read_csv(**read_csv_params, low_memory=False, engine="c")
    warnings.resetwarnings()
    return df

_convert_values_to_na(input_df, values_to_replace, columns_to_convert)

Converts given values to NA in an input dataset

PARAMETER DESCRIPTION
input_df

input dataset

TYPE: DataFrame

values_to_replace

string values to replace with na

TYPE: List[str]

columns_to_convert

subset of columns to convert with na in

TYPE: List[str]

RETURNS DESCRIPTION
DataFrame

pd.DataFrame: dataset with specified values replaced with NAs

Source code in genie/transform.py
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def _convert_values_to_na(
    input_df: pd.DataFrame, values_to_replace: List[str], columns_to_convert: List[str]
) -> pd.DataFrame:
    """Converts given values to NA in an input dataset

    Args:
        input_df (pd.DataFrame): input dataset
        values_to_replace (List[str]): string values to replace with na
        columns_to_convert (List[str]): subset of columns to convert with na in

    Returns:
        pd.DataFrame: dataset with specified values replaced with NAs
    """
    if not input_df.empty:
        replace_mapping = {value: None for value in values_to_replace}
        input_df[columns_to_convert] = input_df[columns_to_convert].replace(
            replace_mapping
        )
    return input_df