Source code for liquorice.utils.Plotting

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import typing
import matplotlib

from liquorice.utils import FitGaussians


[docs]def polyfit(x: typing.List[float], y: typing.List[float], degree: int) -> dict: """ fit a trendline to data, see https://stackoverflow.com/questions/893657/how-do-i-calculate-r-squared-using-python-and-numpy :param x: x values :param y: y values :param degree: 1 for linear, 2 for quadratic, etc. :return: A Dictionary, containing lists with the keys 'polynomial' (polynomial coefficients) and 'determination' (The coefficient of determination). """ results = {} coeffs = np.polyfit(x, y, degree) # Polynomial Coefficients results['polynomial'] = coeffs.tolist() # r-squared p = np.poly1d(coeffs) # fit values, and mean yhat = p(x) # or [p(z) for z in x] ybar = np.sum(y)/len(y) # or sum(y)/len(y) ssreg = np.sum((yhat-ybar)**2) # or sum([ (yihat - ybar)**2 for yihat in yhat]) sstot = np.sum((y - ybar)**2) # or sum([ (yi - ybar)**2 for yi in y]) results['determination'] = ssreg / sstot return results
[docs]class Plotting: """ An object used to generate plots of the aggragated coverage, differences between corrected and uncorrected coverage, as well as correlations between bias-factors and coverage. :param samplename: Name of the sample as it should appear in the plots :param out_dir: Path to the directory to which plots should be saved. """ def __init__(self, samplename: str, out_dir: str) -> None: self.samplename=samplename self.out_dir=out_dir
[docs] def plot_coverage_bias_correlation(self, df: pd.DataFrame, biasfactor_column: str, coverage_column: str, percent: bool = True, xmin: typing.Union[int,float] = 0, xmax: typing.Union[int,float] = 100, ymin: typing.Union[int,float] = 0, ymax: typing.Union[int,float] = 2, filename: typing.Union[str, None] = "auto", return_figure: bool = False) -> typing.Union[None, matplotlib.figure.Figure]: """ Plot the correlation between coverage and a bias factor as a heatmap, and fit a trendline. :param df: `pandas.DataFrame` that contains **biasfactor_column** and **coverage_column**. :param biasfactor_column: Name of the column that contains data for the bias-factor of interest :param coverage_column: Name of the column that contains the coverage data :param percent: Whether the output plot should multiply the x values by the factor 100 :param xmin: min x coordinate to show in plot :param xmax: max y coordinate to show in plot :param ymin: min x coordinate to show in plot :param ymax: max y coordinate to show in plot :param filename: Filename to which the figure should be saved. "auto" sets this to " :attr:`.out_dir`/**biasfactor_column**__vs__**coverage_column**.pdf " (with spaces in column names replaced by '_'). Set to `None` to avoid saving as file. File extension determines in which format the plot is saved. :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved. :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise. """ x=biasfactor_column y=coverage_column heatmap, xedges, yedges = np.histogram2d(df[x]*100 if percent else df[x],df[y], bins=(100,100), range=[[xmin,xmax],[ymin,ymax]]) extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]] plt.clf() plt.imshow(heatmap.T, extent=extent, origin='lower', aspect='auto') plt.ylabel(y) plt.xlabel(x) cbar=plt.colorbar() cbar.set_label('# of bins', rotation=270,labelpad=13) y_lim=plt.gca().get_ylim() plt.suptitle("Bias plot for %s"%(self.samplename)) z = np.polyfit(df[x]*100, df[y], 2) f = np.poly1d(z) x_new=np.linspace(xmin,xmax,50) y_new = f(x_new) plt.plot(x_new, y_new, color="firebrick", linewidth=0.5, ) plt.ylim(y_lim) r2=round(polyfit(df[x],df[y],2)["determination"],3) plt.title(r'$R^{2}$ = %s'%(r2,)) plt.locator_params(nbins=2) if filename=="auto": plt.savefig(f"{self.out_dir}/{x.replace(' ','_')}__vs__{y.replace(' ','_')}.pdf") elif filename is not None: plt.savefig(filename) if return_figure: return plt.gcf() plt.close()
none_or_list_of_float=typing.Union[None,typing.List[float]] none_or_list_of_int=typing.Union[None,typing.List[float]] none_or_list_of_str=typing.Union[None,typing.List[str]]
[docs] def plot_correlations_biasfactors_coverage_bin_nr(self, df: pd.DataFrame, x_varnames_list: none_or_list_of_str = None, y_varnames_list: none_or_list_of_str = None, bins: typing.Tuple[int,int]=(50,50), return_figure: bool = False, filename: str="bias_plots.pdf", percentile_cutoffs: typing.Tuple[ int, int, int, int] = (0.1,0.99,0.1,0.99), fit_regression: bool=True) -> typing.Union[None, matplotlib.figure.Figure]: """ Plot the correlation between coverage and a bias factor as a heatmap, and fit a trendline. :param df: `pandas.DataFrame` that contains the variables in **x_varnames_list** and **y_varnames_list**. :param x_varnames_list: List of the columns that should be plotted as independent variables. Default None uses all columns in the **df** except for ["chromosome","start","end","sequence","mappability"]. :param y_varnames_list: List of the columns that should be plotted as dependent variables. Default None uses "coverage" and "bin nr.", plus "corrected coverage" if this column is present in the **df**. :param bins: Tuple specifying the number of bins to be used for the x and y axis :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved. :param fit_regression: If True, fit a qadratic polynomial and plot the regression function and R^2. :param percentile_cutoffs: Tuple containing the min and max percentile of values that should be displayed on the x and y axes: (x_min,x_max,y_min,y_max). This setting is ignored for "bin nr.", for this column, all values are shown. Percentiles must be in the interval [0,1]. :param filename: Filename to which the figure should be saved. File extension determines in which format the plot is saved. :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise. """ if y_varnames_list is None: y_varnames_list=(["corrected coverage"] if "corrected coverage" in df.columns else []) + \ ["coverage","bin nr."] if x_varnames_list is None: x_varnames_list=[col for col in df.columns if not col in ["chromosome","start","end","sequence","mappability"]] for rownr,y in enumerate(y_varnames_list): for colnr,x in enumerate(x_varnames_list): plt.subplot(len(y_varnames_list), len(x_varnames_list), rownr * len(x_varnames_list) + colnr + 1) ymin=df[y].quantile(percentile_cutoffs[0]) if not y=="bin nr." else df[y].min() ymax=df[y].quantile(percentile_cutoffs[1]) if not y=="bin nr." else df[y].max() xmin=df[x].quantile(percentile_cutoffs[2]) if not x=="bin nr." else df[y].min() xmax=df[x].quantile(percentile_cutoffs[3]) if not x=="bin nr." else df[y].min() nr_of_bins=len(set(df["bin nr."].values)) heatmap, xedges, yedges = np.histogram2d(df[x],df[y], bins=(bins[0] if x!="bin nr." else list(range(0,nr_of_bins+1)), bins[1] if y!="bin nr." else list(range(0,nr_of_bins+1))), range=[[xmin,xmax],[ymin,ymax]]) extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]] plt.imshow(heatmap.T, extent=extent, origin='lower', aspect='auto') plt.ylabel(y) plt.xlabel(x) cbar=plt.colorbar() cbar.set_label('# of bins', rotation=270,labelpad=13) plt.suptitle("Bias plot for %s" % self.samplename) y_lim=plt.gca().get_ylim() x_lim=plt.gca().get_xlim() if fit_regression: df_tmp=df.copy().dropna() z = np.polyfit(df_tmp[x], df_tmp[y], 2) f = np.poly1d(z) x_new=np.linspace(xmin,xmax,50) y_new = f(x_new) plt.plot(x_new, y_new, color="firebrick", linewidth=0.5, ) r2=round(polyfit(df_tmp[x],df_tmp[y],2)["determination"],3) plt.title(r'$R^{2}$ = %s'%(r2,)) # Show the mean as a seperate line if y=="bin nr." and not y==x: plt.plot(df.groupby("bin nr.")[x].mean(),range(0,nr_of_bins),color="pink",linewidth=0.5) elif x=="bin nr." and not y==x: plt.plot(range(0,nr_of_bins),df.groupby("bin nr.")[y].mean().values,color="pink",linewidth=0.5) else: df_tmp=df.copy() df_tmp["tmp bin"],borders=pd.cut(df_tmp[x],50,retbins=True) #create 50 bins based on the x variable midpoints=[(borders[i]+borders[i+1])/2 for i in range(len(borders)-1)] # plot the mean of the y variable for each bin plt.plot(midpoints,df_tmp.groupby("tmp bin")[y].mean(),color="pink",linewidth=0.5) plt.ylim(y_lim) plt.xlim(x_lim) plt.locator_params(nbins=2) if colnr: plt.ylabel("") plt.gcf().set_size_inches(len(x_varnames_list) * 3, len(y_varnames_list) * 3) plt.tight_layout() plt.subplots_adjust(hspace=0.5, wspace=0.5) if filename=="auto": plt.savefig(f"{self.out_dir}/{filename}") elif filename is not None: plt.savefig(filename) if return_figure: return plt.gcf() plt.close()
[docs] def plot_fitted_model(self, fit_gaussians_obj: typing.Union[FitGaussians.FitGaussians,None] = None, x_values: none_or_list_of_int = None, unfitted_y_values: none_or_list_of_float = None, g1_y_values: none_or_list_of_float = None, g2_y_values: none_or_list_of_float = None, g3_y_values: none_or_list_of_float = None, intercept_y_values: none_or_list_of_float = None, combined_model_y_values: none_or_list_of_float = None, unfitted_y_values_label: none_or_list_of_float = "Corrected, aggregated coverage", filename: typing.Union[str,None] = "auto", return_figure: bool = False) -> typing.Union[None, matplotlib.figure.Figure]: """ Plots the fitted model, i.e. the raw data and lines for G1, G2, G3, the intercept, and the combined model. :param fit_gaussians_obj: object of class FitGaussians, on which fit_gaussian_models() has been called. This contains all nessesary information for plotting, so all other parameters except **filename** are ignored if this is set. :param x_values: List of x coordinates - the coordinates of the bins' centers relative to the center of the region of interest. Required if fit_gaussians_obj is not set, ignored otherwise. :param unfitted_y_values: List of y coordinates of the unfitted coverage data. Required if fit_gaussians_obj is not set, ignored otherwise. :param g1_y_values: List of y coordinates for the first, smallest gaussian. Required if fit_gaussians_obj is not set, ignored otherwise. :param g2_y_values: List of y coordinates for the second, middle gaussian. Required if fit_gaussians_obj is not set, ignored otherwise. :param g3_y_values: List of y coordinates for the first, largest gaussian. Required if fit_gaussians_obj is not set, ignored otherwise. :param intercept_y_values: List of y coordinates for the intercept. Required if fit_gaussians_obj is not set, ignored otherwise. :param combined_model_y_values: List of y coordinates for the fitted, combined model. Required if fit_gaussians_obj is not set, ignored otherwise. :param unfitted_y_values_label: Label for the unfitted_y_values. By default it is assumed that the corrected, aggregated, unfitted coverage is given. :param filename: Filename to which the figure should be saved. "auto" sets this to " :attr:`.out_dir`/fitted_gaussians.pdf ". Set to `None` to avoid saving as file. File extension determines in which format the plot is saved. :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved. :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise. """ if fit_gaussians_obj is not None: x_values=fit_gaussians_obj.x_values unfitted_y_values=fit_gaussians_obj.unfitted_y_values g1_y_values=fit_gaussians_obj.g1_y_values g2_y_values=fit_gaussians_obj.g2_y_values g3_y_values=fit_gaussians_obj.g3_y_values intercept_y_values=fit_gaussians_obj.intercept_y_values combined_model_y_values=fit_gaussians_obj.combined_model_y_values if fit_gaussians_obj is None and any([param is None for param in [x_values,unfitted_y_values,g1_y_values, g2_y_values,g3_y_values,intercept_y_values,combined_model_y_values]]): raise TypeError("If fit_gaussians_obj is not specified (or set to None), than the parameters [x_values," "unfitted_y_values,g1_y_values, g2_y_values,g3_y_values,intercept_y_values," "combined_model_y_values] of this function must all be specified and be non-None values.") plt.plot(x_values,combined_model_y_values, label="Combined model", linewidth=1.3) plt.plot(x_values, g1_y_values,":",color="seagreen", label="First gaussian", alpha=0.5) plt.plot(x_values, g2_y_values,"-.",color="purple", label="Second gaussian",alpha=0.5) plt.plot(x_values, g3_y_values,"--",color="orange", label="Third gaussian",alpha=0.5) plt.plot(x_values, intercept_y_values ,"--",color="black", label="Intercept",linewidth=0.5) plt.plot(x_values,unfitted_y_values, "kx", alpha=0.35, label=unfitted_y_values_label) plt.title(f"Fitting of 3 Gaussian Dips and intercept to " f"{unfitted_y_values_label[0].lower() + unfitted_y_values_label[1:]} for sample {self.samplename}") plt.xlabel("Distance from center of RoI [bp]") plt.ylabel(unfitted_y_values_label) plt.legend() fig=plt.gcf() fig.set_size_inches(10,5) if filename=="auto": fig.savefig(f"{self.out_dir}/fitted_gaussians.pdf") elif filename is not None: fig.savefig(filename) if return_figure: return fig plt.close()
[docs] def plot_corrected_vs_uncorrected_coverage(self, x_values: typing.List[int], uncorrected_y_values: typing.List[float], corrected_y_values: typing.List[float], uncorrected_y_intercept: float, corrected_y_intercept: float, filename: typing.Union[str,None] ="auto", return_figure: bool = False) -> \ typing.Union[None, matplotlib.figure.Figure]: """ Plots the aggregated coverages, both corrected and uncorrected. :param x_values: List of x coordinates - the coordinates of the bins' centers relative to the center of the regions of interest. :param uncorrected_y_values: List of aggregated, uncorrected coverage values. :param corrected_y_values: List of aggregated, uncorrected coverage values. :param uncorrected_y_intercept: Intercept value that should be subtracted from every value in **uncorrected_y_values** :param corrected_y_intercept: Intercept value that should be subtracted from every value in **corrected_y_values** :param filename: Filename to which the figure should be saved. "auto" sets this to " :attr:`.out_dir`/corrected_vs_uncorrected_coverage.pdf ". Set to `None` to avoid saving as file. File extension determines in which format the plot is saved. :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved. :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise. Example: .. code-block:: python plotting.plot_corrected_vs_uncorrected_coverage( x_values=fit_gaussians_obj.x_values, uncorrected_y_values=fit_gaussians_obj_uncorrected.unfitted_y_values, corrected_y_values=fit_gaussians_obj.unfitted_y_values, uncorrected_y_intercept=fit_gaussians_obj_uncorrected.intercept_y_values[0], corrected_y_intercept=fit_gaussians_obj.intercept_y_values[0]) """ uncorrected_y_values=[y-uncorrected_y_intercept for y in uncorrected_y_values] corrected_y_values=[y-corrected_y_intercept for y in corrected_y_values] plt.plot(x_values,uncorrected_y_values, color="firebrick",label="Uncorrected, aggregated coverage", linewidth=0.5,alpha=0.5,marker="x") plt.plot(x_values, corrected_y_values,color="navy", label="Corrected, aggregated coverage", alpha=0.5, marker="x") plt.xlabel("Distance from center of RoI [bp]") plt.ylabel("Coverage, normalized by intercept") plt.title(f"Corrected vs uncorrected coverage for sample {self.samplename}") plt.legend(fontsize=8) plt.locator_params(axis='y', nbins=5) plt.locator_params(axis='x', nbins=5) if filename=="auto": plt.gcf().savefig(f"{self.out_dir}/corrected_vs_uncorrected_coverage.pdf") elif filename is not None: plt.gcf().savefig(f"{self.out_dir}/{filename}") if return_figure: return plt.gcf() plt.close()