Source code for liquorice.utils.FitGaussians

import numpy as np
from lmfit.models import GaussianModel, ConstantModel
import pandas as pd
import logging
import typing


[docs]class FitGaussians: """ An object that can be used for quantification of coverage drops by fitting gaussian functions and an intercept to the coverage values. :param unfitted_y_values: A pandas `Series` of coverage values, one value per bin (and sorted by increasing bin nr.). A suitable input is the result of :func:`liquorice.utils.AggregateAcrossrRegions.aggregate_across_regions()`. :param extend_to: `extend_to` setting that was used when calculating the y values. Needed to calculate relative bin positions. :param binsize: `binsize` setting that was used when calculating the y values. Needed to calculate relative bin positions. :param samplename: If not `None`, add this in a column "sample" to the result `DataFrame` when calling :func:`.fit_gaussian_models` :param regionset_name: If not `None`, add this in a column "region-set" to the result `DataFrame` when calling :func:`.fit_gaussian_models` :param avg_centersize: Average size (i.e. width in bp) of the regions of interest (before extension by extend_to). """ def __init__(self, unfitted_y_values: typing.List[float], extend_to: int, binsize: int, avg_centersize: float, samplename: str = None, regionset_name: str = None,) -> None: self.extend_to=extend_to self.binsize=binsize self.unfitted_y_values=unfitted_y_values self.avg_centersize=int(avg_centersize) self.samplename=samplename self.regionset_name=regionset_name self.x_values=None self.intercept_y_values=None self.g1_y_values=None self.g2_y_values=None self.g3_y_values=None self.combined_model_y_values=None
[docs] def fit_gaussian_models(self, g1_min_sigma: typing.Union[int,float] = 20, g1_max_sigma: typing.Union[int,float] = 200, g2_min_sigma: typing.Union[int,float] = 200, g2_max_sigma: typing.Union[int,float] = 3000, g3_min_sigma: typing.Union[int,float] = 3000, g3_max_sigma: typing.Union[int,float] = 40000, method: str = "leastsq") -> pd.DataFrame: """ Fits three gaussian functions to :attr:`.unfitted_y_values` (usually aggregated coverage values), based on given limits. Sets :attr:`x_values`, :attr:`g1_y_values`, :attr:`g2_y_values`, :attr:`g3_y_values`, :attr:`intercept_y_values` and :attr:`combined_model_y_values` for the object. :param g1_min_sigma: Min σ value for the smallest gaussian :param g1_max_sigma: Max σ value for the smallest gaussian :param g2_min_sigma: Min σ value for the middle gaussian :param g2_max_sigma: Max σ value for the middle gaussian :param g3_min_sigma: Min σ value for the widest gaussian :param g3_max_sigma: Max σ value for the widest gaussian :param method: A method for fitting the combined model, used as parameter in lmfit's .fit() function. :return: A `pandas.DataFrame` containing several summary metrics based on the fitted model: The amplitude and σ values for each of the three gaussians, the total depth of the fitted model at the central bin, the Baysian Information Criterion (measures model fit), and the area under the curve of the fitted model, calculated as the area between the fitted intercept and the fitted model. If :attr:`samplename` or :attr:`regionset_name` are not `None`, columns "sample" or "region-set" are added, respectively. """ logging.info("Fitting gaussian functions and intercept to coverage values ...") y=self.unfitted_y_values avg_centersize=self.avg_centersize central_bin_x_values=[x*avg_centersize-avg_centersize/2 for x in [0.05,0.175,0.5,0.825,0.95]] # try if all bins have the same size: x=list(np.arange(-self.extend_to,self.extend_to+1, self.binsize)) # if not, assume that the central bins are smaller and have the coordinates in central_bin_x_values if len(x)!=len(y): x=list(np.arange(-self.extend_to+(self.binsize/2)-avg_centersize/2,(self.binsize/2)-avg_centersize/2,self.binsize))+central_bin_x_values+list(np.arange(avg_centersize/2+(self.binsize/2),self.extend_to+avg_centersize/2+(self.binsize/2),self.binsize)) assert len(x)==len(y) comb_dipmodel=GaussianModel(prefix='G1_')+GaussianModel(prefix='G2_')+GaussianModel(prefix='G3_')+\ ConstantModel(prefix="Const_") comb_dipmodel.set_param_hint('G1_center',value=0,vary=False) comb_dipmodel.set_param_hint('G2_center',value=0,vary=False) comb_dipmodel.set_param_hint('G3_center',value=0,vary=False) comb_dipmodel.set_param_hint('G1_sigma',value=(g1_min_sigma+g1_max_sigma)/2,min=g1_min_sigma, max=g1_max_sigma) comb_dipmodel.set_param_hint('G2_sigma',value=(g2_min_sigma+g2_max_sigma)/2,min=g2_min_sigma, max=g2_max_sigma) comb_dipmodel.set_param_hint('G3_sigma',value=(g3_min_sigma+g3_max_sigma)/2,min=g3_min_sigma, max=g3_max_sigma) comb_dipmodel.set_param_hint('Const_c',value=0) comb_dipmodel_paramas=comb_dipmodel.make_params() comb_dipmodel=comb_dipmodel.fit(data=y,x=x,params=comb_dipmodel_paramas, method=method) comps = comb_dipmodel.eval_components() # Sort the gaussians such that the narrowest one is the first: sortedgaussians=[("G1_",comb_dipmodel.best_values["G1_sigma"]), ("G2_",comb_dipmodel.best_values["G2_sigma"]),("G3_",comb_dipmodel.best_values["G3_sigma"])] sortedgaussians=sorted(sortedgaussians,key=lambda x: x[1]) self.x_values=x self.g1_y_values=comps[sortedgaussians[0][0]] self.g2_y_values=comps[sortedgaussians[1][0]] self.g3_y_values=comps[sortedgaussians[2][0]] self.intercept_y_values=[comps["Const_"] for _ in x] self.combined_model_y_values=comb_dipmodel.best_fit sum_g_heights=comb_dipmodel.params["G1_height"].value+comb_dipmodel.params["G2_height"].value+\ comb_dipmodel.params["G3_height"].value sampledict={} for key, value in comb_dipmodel.params.valuesdict().items(): if "center" in key: # center is 0 anyway continue # this is required such that the narrow gaussian is always the G1: if "G" in key and "amplitude" in key or "sigma" in key: for index,(name,sigma) in enumerate(sortedgaussians): # key is, e.g. G1_height. If G2 is the second-most narrow peak, rename G1_height to G2_height if name.startswith(key[:2]): sampledict["G"+str(index+1)+key[2:]]=value sampledict["Baysian Information Criterion"]=comb_dipmodel.bic sampledict["Total dip depth"]=-sum_g_heights sampledict["Intercept"]=comb_dipmodel.best_values["Const_c"] # calculate area under the curve, taking the intercept as a reference y_vals_for_AOC=[y-comps["Const_"] for y in comb_dipmodel.best_fit] sampledict["Total dip area (AOC combined model)"]=np.trapz(x=x,y=y_vals_for_AOC) param_df=pd.Series(sampledict).to_frame().T if self.samplename is not None: param_df["sample"]=self.samplename if self.regionset_name is not None: param_df["region-set"]=self.regionset_name return param_df