Source code for liquorice.utils.MeanSequencingDepth

#import deeptools.countReadsPerBin as crpb
from liquorice.utils import deeptoolsCountReadsPerBinDontCheckProperPairSAMflag as crpb
import logging
import pandas as pd
import typing
import os


[docs]def sample_bam_to_get_sequencing_depth(bam_filepath: str, n_sites_to_sample: int = 10000, n_cores: int = 1, min_mapq: int = 20, chromosomes_list: typing.List[str] = None, **additional_crpb_kwargs: dict) -> float: """ Estimates the overall sequencing coverage of the .bam file by sampling sites that are regularily distributed across the genome (using :func:`deeptools.countReadsPerBin.CountReadsPerBin` in a slightly modified version that allows fragments without the SAM flag is_proper_pair.). :param bam_filepath: Path to the .bam file for which mean coverage should be calculated. :param n_sites_to_sample: Number of sites (length 1) to sample. :param n_cores: Number of cores to be used by :func:`deeptools.countReadsPerBin.CountReadsPerBin`. :param min_mapq: `minMappingQuality` setting for `deeptools.countReadsPerBin.CountReadsPerBin`. :param chromosomes_list: A list of chromosomes that should be analyzed. Regions on chromosomes that are not in this list will be excluded from analysis. Default None means ["chr1", ..., "chr22"]. :param additional_crpb_kwargs: Use to override the default parameters for :func:`deeptools.countReadsPerBin.CountReadsPerBin`. If not specified otherwise in `additional_crpb_kwargs`, the following attributes are passed to `CountReadsPerBin`: `ignoreDuplicates=True`, `samFlag_exclude=256`, `extendReads=True`. :return: The average coverage determined from the sampled regions. """ if chromosomes_list is None: chromosomes_list=["chr"+str(i) for i in range(1,23)] crpb_kwargs={ "minMappingQuality":min_mapq, "ignoreDuplicates":True, "numberOfProcessors":n_cores, "samFlag_exclude":256, # not primary alignment "extendReads":True} crpb_kwargs.update(additional_crpb_kwargs) logging.info(f"Calculating coverage at {n_sites_to_sample} regions" f" regularily distributed across the genome to determine overall coverage ...") crpb_obj = crpb.CountReadsPerBin( [bam_filepath], binLength=1, numberOfSamples=n_sites_to_sample, out_file_for_raw_data="sequencing_depth_at_sampled_regions.tsv", **crpb_kwargs) crpb_obj.run() seqdepth_df=pd.read_csv("sequencing_depth_at_sampled_regions.tsv",sep="\t",header=None) seqdepth_df.columns=["chr","start","end","coverage"] seqdepth_df["chr"]=seqdepth_df["chr"].astype("str") mean_seq_depth=seqdepth_df[seqdepth_df["chr"].isin(chromosomes_list)]["coverage"].mean() os.remove("sequencing_depth_at_sampled_regions.tsv") # mean_seq_depth=cr.run().mean(axis=0)[0] logging.info(f"Mean coverage at the {n_sites_to_sample} regions of length 1 is " f"{mean_seq_depth}.") return mean_seq_depth