Source code for liquorice.utils.CorrectForCNAs

import pandas as pd
import sys
import logging
import numpy as np
import os
os.environ["PYTHONPATH"] = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) # required by Ray, which is
# used by modin
import modin.pandas as modinpd
import swifter

bin_nr_without_unique_matching_segment=0

[docs]def correct_coverage_per_bin_for_cnas(df: pd.DataFrame,cna_seg_filepath:str,n_cores:int = 1) -> pd.DataFrame:
    """
    Extracts the log2(observed/expected) read depth ratio for the corresponding segment for each bin, and corrects each
    bin's coverage accordingly.

    :param df: A `pandas.DataFrame` with at least the following columns: "chromosome", "start","end","coverage"
    :param cna_seg_filepath: A .seg file that should be used to correct the coverage by the values specified in
       this file.  File must be tab-separated, with column names as first line. The second, third and fourth
       column must be chromosome, start, and end of the segment, and the last column must be the log2-ratio of
       observed/expected read depth.
    :param n_cores: How many cores to use to parallelize the operation.
    :return: A `pandas.DataFrame` similar to the input *df*, with the values in the column
        "coverage" changed according to the CNA correction factor of each bin, and an additional column
        "CNA-uncorrected coverage" that contains the original coverage values.
    """
    logging.info(f"Correcting the coverage for CNAs, using '{cna_seg_filepath}' ...")
    try:
        seg_df=pd.read_csv(cna_seg_filepath,sep="\t")
        seg_df=seg_df[seg_df.columns[[1,2,3,-1]]]
        seg_df.columns=["chromosome", "start","end","1/log2_correction_factor"]
        seg_df["chromosome"]=seg_df["chromosome"].astype("str")

    except IndexError:
        sys.exit(f"Error: There is something wrong with the cna_seg_filepath (or --cna_seg_file) '{cna_seg_filepath}'. "
                 "The file must be tab-separated, with column names as first line. The second, third and fourth "
                 "column must be chromosome, start, and end of the segment, and the last column must be the log2-ratio "
                 "of observed/expected read depth.")

    df["CNA-uncorrected coverage"]=df["coverage"].values
    global bin_nr_without_unique_matching_segment
    bin_nr_without_unique_matching_segment=0
    if n_cores==1:
        df["coverage"]=df.apply(lambda row: get_CNV_corrected_coverage_for_bin(bin_chrom=row["chromosome"],
                                                                               bin_start=row["start"],
                                                                               bin_end=row["end"],
                                                                               bin_coverage=row["coverage"],
                                                                               seg_df=seg_df), axis=1)
    else:
        df["coverage"]=df.swifter.progress_bar(False).set_npartitions(
            n_cores).apply(lambda row: get_CNV_corrected_coverage_for_bin(bin_chrom=row["chromosome"],
                                                                          bin_start=row["start"],
                                                                          bin_end=row["end"],
                                                                          bin_coverage=row["coverage"],
                                                                          seg_df=seg_df), axis=1)
    if bin_nr_without_unique_matching_segment:
        logging.info(f"Could not find a unique segment for CNA-correction for {bin_nr_without_unique_matching_segment} "
                     f" bins ({round(100*(bin_nr_without_unique_matching_segment/df.shape[0]),1)}% of total bins)."
                     f" This happens if a bin overlaps two different segments, "
                     f"the bin is in a region that is blacklisted/excluded by the CNA detection program, "
                     f"or if the .seg file is incomplete. CNA correction was skipped for these bins.")
    return df


[docs]def get_CNV_corrected_coverage_for_bin(bin_chrom: str, bin_start:int, bin_end:int, bin_coverage:float, seg_df: pd.DataFrame) -> float:
    """
    For a given bin, returns the coverage corrected for copy number aberrations.

    :param bin_chrom: Chromosome of the bin.
    :param bin_start: Genomic start position of the bin.
    :param bin_end: Genomic end position of the bin.
    :param bin_coverage: Bin's coverage before CNA correction
    :param seg_df: A `pandas.DataFrame` of a .seg file, with the following columns: "chromosome", "start","end",
        "1/log2_correction_factor".
    :return: A float indicating the CNV-corrected coverage for the bin.
    """
    global bin_nr_without_unique_matching_segment
    seg_df=seg_df[(seg_df["chromosome"]==bin_chrom) & (seg_df["start"]<bin_start) & (seg_df["end"]>bin_end)]
    if seg_df.shape[0]!=1:
        bin_nr_without_unique_matching_segment+=1
        logging.debug(f"Could not find a unique segment for CNA-correction for bin with coordinates "
                        f"{bin_chrom}:{bin_start}-{bin_end}. This can happen if a bin overlaps two different segments,"
                        f"the bin is in a region that is blacklisted/excluded by the CNA detection program,"
                        f"or if the .seg file is incomplete. Skipping CNA correction for this bin.")
        return bin_coverage
    else:
        return bin_coverage/np.power(2,seg_df["1/log2_correction_factor"].values[0])