Source code for liquorice.utils.CorrectForCNAs

import pandas as pd
import sys
import logging
import numpy as np
import os
os.environ["PYTHONPATH"] = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) # required by Ray, which is
# used by modin
import modin.pandas as modinpd
import swifter

bin_nr_without_unique_matching_segment=0

[docs]def correct_coverage_per_bin_for_cnas(df: pd.DataFrame,cna_seg_filepath:str,n_cores:int = 1) -> pd.DataFrame: """ Extracts the log2(observed/expected) read depth ratio for the corresponding segment for each bin, and corrects each bin's coverage accordingly. :param df: A `pandas.DataFrame` with at least the following columns: "chromosome", "start","end","coverage" :param cna_seg_filepath: A .seg file that should be used to correct the coverage by the values specified in this file. File must be tab-separated, with column names as first line. The second, third and fourth column must be chromosome, start, and end of the segment, and the last column must be the log2-ratio of observed/expected read depth. :param n_cores: How many cores to use to parallelize the operation. :return: A `pandas.DataFrame` similar to the input *df*, with the values in the column "coverage" changed according to the CNA correction factor of each bin, and an additional column "CNA-uncorrected coverage" that contains the original coverage values. """ logging.info(f"Correcting the coverage for CNAs, using '{cna_seg_filepath}' ...") try: seg_df=pd.read_csv(cna_seg_filepath,sep="\t") seg_df=seg_df[seg_df.columns[[1,2,3,-1]]] seg_df.columns=["chromosome", "start","end","1/log2_correction_factor"] seg_df["chromosome"]=seg_df["chromosome"].astype("str") except IndexError: sys.exit(f"Error: There is something wrong with the cna_seg_filepath (or --cna_seg_file) '{cna_seg_filepath}'. " "The file must be tab-separated, with column names as first line. The second, third and fourth " "column must be chromosome, start, and end of the segment, and the last column must be the log2-ratio " "of observed/expected read depth.") df["CNA-uncorrected coverage"]=df["coverage"].values global bin_nr_without_unique_matching_segment bin_nr_without_unique_matching_segment=0 if n_cores==1: df["coverage"]=df.apply(lambda row: get_CNV_corrected_coverage_for_bin(bin_chrom=row["chromosome"], bin_start=row["start"], bin_end=row["end"], bin_coverage=row["coverage"], seg_df=seg_df), axis=1) else: df["coverage"]=df.swifter.progress_bar(False).set_npartitions( n_cores).apply(lambda row: get_CNV_corrected_coverage_for_bin(bin_chrom=row["chromosome"], bin_start=row["start"], bin_end=row["end"], bin_coverage=row["coverage"], seg_df=seg_df), axis=1) if bin_nr_without_unique_matching_segment: logging.info(f"Could not find a unique segment for CNA-correction for {bin_nr_without_unique_matching_segment} " f" bins ({round(100*(bin_nr_without_unique_matching_segment/df.shape[0]),1)}% of total bins)." f" This happens if a bin overlaps two different segments, " f"the bin is in a region that is blacklisted/excluded by the CNA detection program, " f"or if the .seg file is incomplete. CNA correction was skipped for these bins.") return df
[docs]def get_CNV_corrected_coverage_for_bin(bin_chrom: str, bin_start:int, bin_end:int, bin_coverage:float, seg_df: pd.DataFrame) -> float: """ For a given bin, returns the coverage corrected for copy number aberrations. :param bin_chrom: Chromosome of the bin. :param bin_start: Genomic start position of the bin. :param bin_end: Genomic end position of the bin. :param bin_coverage: Bin's coverage before CNA correction :param seg_df: A `pandas.DataFrame` of a .seg file, with the following columns: "chromosome", "start","end", "1/log2_correction_factor". :return: A float indicating the CNV-corrected coverage for the bin. """ global bin_nr_without_unique_matching_segment seg_df=seg_df[(seg_df["chromosome"]==bin_chrom) & (seg_df["start"]<bin_start) & (seg_df["end"]>bin_end)] if seg_df.shape[0]!=1: bin_nr_without_unique_matching_segment+=1 logging.debug(f"Could not find a unique segment for CNA-correction for bin with coordinates " f"{bin_chrom}:{bin_start}-{bin_end}. This can happen if a bin overlaps two different segments," f"the bin is in a region that is blacklisted/excluded by the CNA detection program," f"or if the .seg file is incomplete. Skipping CNA correction for this bin.") return bin_coverage else: return bin_coverage/np.power(2,seg_df["1/log2_correction_factor"].values[0])