Source code for liquorice.utils.Plotting

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import typing
import matplotlib

from liquorice.utils import FitGaussians


[docs]def polyfit(x: typing.List[float], y: typing.List[float], degree: int) -> dict:
    """
    fit a trendline to data, see https://stackoverflow.com/questions/893657/how-do-i-calculate-r-squared-using-python-and-numpy

    :param x: x values
    :param y: y values
    :param degree: 1 for linear, 2 for quadratic, etc.
    :return: A Dictionary, containing lists with the keys 'polynomial' (polynomial coefficients) and 'determination'
        (The coefficient of determination).
    """
    results = {}

    coeffs = np.polyfit(x, y, degree)

    # Polynomial Coefficients
    results['polynomial'] = coeffs.tolist()

    # r-squared
    p = np.poly1d(coeffs)
    # fit values, and mean
    yhat = p(x)                         # or [p(z) for z in x]
    ybar = np.sum(y)/len(y)          # or sum(y)/len(y)
    ssreg = np.sum((yhat-ybar)**2)   # or sum([ (yihat - ybar)**2 for yihat in yhat])
    sstot = np.sum((y - ybar)**2)    # or sum([ (yi - ybar)**2 for yi in y])
    results['determination'] = ssreg / sstot
    return results


[docs]class Plotting:
    """
    An object used to generate plots of the aggragated coverage, differences between corrected and uncorrected
    coverage, as well as correlations between bias-factors and coverage.

    :param samplename: Name of the sample as it should appear in the plots
    :param out_dir: Path to the directory to which plots should be saved.
    """

    def __init__(self, samplename: str, out_dir: str) -> None:

        self.samplename=samplename
        self.out_dir=out_dir


[docs]    def plot_coverage_bias_correlation(self, df: pd.DataFrame, biasfactor_column: str, coverage_column: str,
                                       percent: bool = True, xmin: typing.Union[int,float] = 0,
                                       xmax: typing.Union[int,float] = 100,
                                       ymin: typing.Union[int,float] = 0,
                                       ymax: typing.Union[int,float] = 2,
                                       filename: typing.Union[str, None] = "auto",
                                       return_figure: bool = False) -> typing.Union[None,
                                       matplotlib.figure.Figure]:
        """
        Plot the correlation between coverage and a bias factor as a heatmap, and fit a trendline.

        :param df: `pandas.DataFrame` that contains **biasfactor_column** and  **coverage_column**.
        :param biasfactor_column: Name of the column that contains data for the bias-factor of interest
        :param coverage_column: Name of the column that contains the coverage data
        :param percent: Whether the output plot should multiply the x values by the factor 100
        :param xmin: min x coordinate to show in plot
        :param xmax: max y coordinate to show in plot
        :param ymin: min x coordinate to show in plot
        :param ymax: max y coordinate to show in plot
        :param filename: Filename to which the figure should be saved. "auto" sets this to
            " :attr:`.out_dir`/**biasfactor_column**__vs__**coverage_column**.pdf " (with spaces in column names
            replaced by '_'). Set to `None` to avoid saving as
            file. File extension determines in which format the plot is saved.
        :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved.
        :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise.
        """

        x=biasfactor_column
        y=coverage_column

        heatmap, xedges, yedges = np.histogram2d(df[x]*100 if percent else df[x],df[y], bins=(100,100),
                                                 range=[[xmin,xmax],[ymin,ymax]])
        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
        plt.clf()
        plt.imshow(heatmap.T, extent=extent, origin='lower', aspect='auto')
        plt.ylabel(y)
        plt.xlabel(x)
        cbar=plt.colorbar()
        cbar.set_label('# of bins', rotation=270,labelpad=13)

        y_lim=plt.gca().get_ylim()

        plt.suptitle("Bias plot for %s"%(self.samplename))
        z = np.polyfit(df[x]*100, df[y], 2)
        f = np.poly1d(z)
        x_new=np.linspace(xmin,xmax,50)
        y_new = f(x_new)
        plt.plot(x_new, y_new, color="firebrick", linewidth=0.5, )
        plt.ylim(y_lim)

        r2=round(polyfit(df[x],df[y],2)["determination"],3)
        plt.title(r'$R^{2}$ = %s'%(r2,))
        plt.locator_params(nbins=2)
        if filename=="auto":
            plt.savefig(f"{self.out_dir}/{x.replace(' ','_')}__vs__{y.replace(' ','_')}.pdf")
        elif filename is not None:
            plt.savefig(filename)

        if return_figure:
            return plt.gcf()
        plt.close()

    none_or_list_of_float=typing.Union[None,typing.List[float]]
    none_or_list_of_int=typing.Union[None,typing.List[float]]
    none_or_list_of_str=typing.Union[None,typing.List[str]]


[docs]    def plot_correlations_biasfactors_coverage_bin_nr(self, df: pd.DataFrame,
                                                      x_varnames_list: none_or_list_of_str = None,
                                                      y_varnames_list: none_or_list_of_str = None,
                                                      bins: typing.Tuple[int,int]=(50,50),
                                                      return_figure: bool = False,
                                                      filename: str="bias_plots.pdf",
                                                      percentile_cutoffs: typing.Tuple[
                                                          int, int, int, int] = (0.1,0.99,0.1,0.99),
                                                      fit_regression: bool=True) -> typing.Union[None,
    matplotlib.figure.Figure]:
        """
        Plot the correlation between coverage and a bias factor as a heatmap, and fit a trendline.

        :param df: `pandas.DataFrame` that contains the variables in **x_varnames_list** and  **y_varnames_list**.
        :param x_varnames_list: List of the columns that should be plotted as independent variables. Default None uses
            all columns in the **df** except for ["chromosome","start","end","sequence","mappability"].
        :param y_varnames_list: List of the columns that should be plotted as dependent variables. Default None uses
            "coverage" and "bin nr.", plus "corrected coverage" if this column is present in the **df**.
        :param bins: Tuple specifying the number of bins to be used for the x and y axis
        :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved.
        :param fit_regression: If True, fit a qadratic polynomial and plot the regression function and R^2.
        :param percentile_cutoffs: Tuple containing the min and max percentile of values that should be displayed on the
            x and y axes: (x_min,x_max,y_min,y_max). This setting is ignored for "bin nr.", for this column, all values
            are shown. Percentiles must be in the interval [0,1].
        :param filename: Filename to which the figure should be saved. File extension determines in which format the
            plot is saved.
        :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise.
        """

        if y_varnames_list is None:
            y_varnames_list=(["corrected coverage"] if "corrected coverage" in df.columns else []) + \
                    ["coverage","bin nr."]
        if x_varnames_list is None:
            x_varnames_list=[col for col in df.columns if not col in
                                                              ["chromosome","start","end","sequence","mappability"]]

        for rownr,y in enumerate(y_varnames_list):
            for colnr,x in enumerate(x_varnames_list):
                plt.subplot(len(y_varnames_list), len(x_varnames_list), rownr * len(x_varnames_list) + colnr + 1)

                ymin=df[y].quantile(percentile_cutoffs[0]) if not y=="bin nr." else df[y].min()
                ymax=df[y].quantile(percentile_cutoffs[1]) if not y=="bin nr." else df[y].max()
                xmin=df[x].quantile(percentile_cutoffs[2]) if not x=="bin nr." else df[y].min()
                xmax=df[x].quantile(percentile_cutoffs[3]) if not x=="bin nr." else df[y].min()
                nr_of_bins=len(set(df["bin nr."].values))

                heatmap, xedges, yedges = np.histogram2d(df[x],df[y],
                                                         bins=(bins[0] if x!="bin nr." else list(range(0,nr_of_bins+1)),
                                                               bins[1] if y!="bin nr." else list(range(0,nr_of_bins+1))),
                                                         range=[[xmin,xmax],[ymin,ymax]])
                extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
                plt.imshow(heatmap.T, extent=extent, origin='lower', aspect='auto')

                plt.ylabel(y)
                plt.xlabel(x)
                cbar=plt.colorbar()
                cbar.set_label('# of bins', rotation=270,labelpad=13)
                plt.suptitle("Bias plot for %s" % self.samplename)
                y_lim=plt.gca().get_ylim()
                x_lim=plt.gca().get_xlim()

                if fit_regression:
                    df_tmp=df.copy().dropna()
                    z = np.polyfit(df_tmp[x], df_tmp[y], 2)
                    f = np.poly1d(z)
                    x_new=np.linspace(xmin,xmax,50)
                    y_new = f(x_new)
                    plt.plot(x_new, y_new, color="firebrick", linewidth=0.5, )
                    r2=round(polyfit(df_tmp[x],df_tmp[y],2)["determination"],3)
                    plt.title(r'$R^{2}$ = %s'%(r2,))

                # Show the mean as a seperate line
                if y=="bin nr." and not y==x:
                    plt.plot(df.groupby("bin nr.")[x].mean(),range(0,nr_of_bins),color="pink",linewidth=0.5)
                elif x=="bin nr." and not y==x:
                    plt.plot(range(0,nr_of_bins),df.groupby("bin nr.")[y].mean().values,color="pink",linewidth=0.5)
                else:
                    df_tmp=df.copy()
                    df_tmp["tmp bin"],borders=pd.cut(df_tmp[x],50,retbins=True) #create 50 bins based on the x variable
                    midpoints=[(borders[i]+borders[i+1])/2 for i in range(len(borders)-1)]
                    # plot the mean of the y variable for each bin
                    plt.plot(midpoints,df_tmp.groupby("tmp bin")[y].mean(),color="pink",linewidth=0.5)

                plt.ylim(y_lim)
                plt.xlim(x_lim)
                plt.locator_params(nbins=2)
                if colnr:
                    plt.ylabel("")

        plt.gcf().set_size_inches(len(x_varnames_list) * 3, len(y_varnames_list) * 3)
        plt.tight_layout()
        plt.subplots_adjust(hspace=0.5, wspace=0.5)

        if filename=="auto":
            plt.savefig(f"{self.out_dir}/{filename}")
        elif filename is not None:
            plt.savefig(filename)
        if return_figure:
            return plt.gcf()
        plt.close()

[docs]    def plot_fitted_model(self, fit_gaussians_obj: typing.Union[FitGaussians.FitGaussians,None] = None,
                          x_values: none_or_list_of_int = None, unfitted_y_values: none_or_list_of_float = None,
                          g1_y_values: none_or_list_of_float = None,
                          g2_y_values: none_or_list_of_float = None, g3_y_values: none_or_list_of_float = None,
                          intercept_y_values: none_or_list_of_float = None,
                          combined_model_y_values: none_or_list_of_float = None,
                          unfitted_y_values_label: none_or_list_of_float = "Corrected, aggregated coverage",
                          filename: typing.Union[str,None] = "auto",
                          return_figure: bool = False) -> typing.Union[None,
                          matplotlib.figure.Figure]:
        """
        Plots the fitted model, i.e. the raw data and lines for G1, G2, G3, the intercept, and the combined model.

        :param fit_gaussians_obj: object of class FitGaussians, on which fit_gaussian_models() has been called. This
            contains all nessesary information for plotting, so all other parameters except **filename** are ignored if
            this is set.
        :param x_values: List of x coordinates - the coordinates of the bins' centers relative to the center of the
            region of interest. Required if fit_gaussians_obj is not set, ignored otherwise.
        :param unfitted_y_values: List of y coordinates of the unfitted coverage data.
            Required if fit_gaussians_obj is not set, ignored otherwise.
        :param g1_y_values: List of y coordinates for the first, smallest gaussian.
            Required if fit_gaussians_obj is not set, ignored otherwise.
        :param g2_y_values: List of y coordinates for the second, middle gaussian.
            Required if fit_gaussians_obj is not set, ignored otherwise.
        :param g3_y_values: List of y coordinates for the first, largest gaussian.
            Required if fit_gaussians_obj is not set, ignored otherwise.
        :param intercept_y_values: List of y coordinates for the intercept.
          Required if fit_gaussians_obj is not set, ignored otherwise.
        :param combined_model_y_values: List of y coordinates for the fitted, combined model.
            Required if fit_gaussians_obj is not set, ignored otherwise.
        :param unfitted_y_values_label: Label for the unfitted_y_values. By default it is assumed that the corrected,
            aggregated, unfitted coverage is given.
        :param filename: Filename to which the figure should be saved. "auto" sets this to
            " :attr:`.out_dir`/fitted_gaussians.pdf ". Set to `None` to avoid saving as
            file. File extension determines in which format the plot is saved.
        :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved.
        :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise.
        """

        if fit_gaussians_obj is not None:
            x_values=fit_gaussians_obj.x_values
            unfitted_y_values=fit_gaussians_obj.unfitted_y_values
            g1_y_values=fit_gaussians_obj.g1_y_values
            g2_y_values=fit_gaussians_obj.g2_y_values
            g3_y_values=fit_gaussians_obj.g3_y_values
            intercept_y_values=fit_gaussians_obj.intercept_y_values
            combined_model_y_values=fit_gaussians_obj.combined_model_y_values

        if fit_gaussians_obj is None and any([param is None for param in [x_values,unfitted_y_values,g1_y_values,
            g2_y_values,g3_y_values,intercept_y_values,combined_model_y_values]]):
            raise TypeError("If fit_gaussians_obj is not specified (or set to None), than the parameters [x_values,"
                            "unfitted_y_values,g1_y_values, g2_y_values,g3_y_values,intercept_y_values,"
                            "combined_model_y_values] of this function must all be specified and be non-None values.")

        plt.plot(x_values,combined_model_y_values, label="Combined model", linewidth=1.3)
        plt.plot(x_values, g1_y_values,":",color="seagreen", label="First gaussian", alpha=0.5)
        plt.plot(x_values, g2_y_values,"-.",color="purple", label="Second gaussian",alpha=0.5)
        plt.plot(x_values, g3_y_values,"--",color="orange", label="Third gaussian",alpha=0.5)
        plt.plot(x_values, intercept_y_values ,"--",color="black", label="Intercept",linewidth=0.5)
        plt.plot(x_values,unfitted_y_values, "kx", alpha=0.35, label=unfitted_y_values_label)
        plt.title(f"Fitting of 3 Gaussian Dips and intercept to "
                  f"{unfitted_y_values_label[0].lower() + unfitted_y_values_label[1:]} for sample {self.samplename}")
        plt.xlabel("Distance from center of RoI [bp]")
        plt.ylabel(unfitted_y_values_label)
        plt.legend()
        fig=plt.gcf()
        fig.set_size_inches(10,5)
        if filename=="auto":
            fig.savefig(f"{self.out_dir}/fitted_gaussians.pdf")
        elif filename is not None:
            fig.savefig(filename)
        if return_figure:
            return fig
        plt.close()

[docs]    def plot_corrected_vs_uncorrected_coverage(self, x_values: typing.List[int],
                                               uncorrected_y_values: typing.List[float],
                                               corrected_y_values: typing.List[float],
                                               uncorrected_y_intercept: float, corrected_y_intercept: float,
                                               filename: typing.Union[str,None] ="auto",
                                               return_figure: bool = False) -> \
            typing.Union[None, matplotlib.figure.Figure]:
        """
        Plots the aggregated coverages, both corrected and uncorrected.

        :param x_values: List of x coordinates - the coordinates of the bins' centers relative to the center of the
            regions of interest.
        :param uncorrected_y_values: List of aggregated, uncorrected coverage values.
        :param corrected_y_values: List of aggregated, uncorrected coverage values.
        :param uncorrected_y_intercept: Intercept value that should be subtracted from every value in
            **uncorrected_y_values**
        :param corrected_y_intercept: Intercept value that should be subtracted from every value in
            **corrected_y_values**
        :param filename: Filename to which the figure should be saved. "auto" sets this to
            " :attr:`.out_dir`/corrected_vs_uncorrected_coverage.pdf ". Set to `None` to avoid saving as
            file. File extension determines in which format the plot is saved.
        :param return_figure: If True, return a `matplotlib figure` that can be altered, plotted, or saved.
        :return: A `matplotlib figure` if **return_figure** is True, nothing otherwise.

        Example:

        .. code-block:: python

            plotting.plot_corrected_vs_uncorrected_coverage(
                x_values=fit_gaussians_obj.x_values,
                uncorrected_y_values=fit_gaussians_obj_uncorrected.unfitted_y_values,
                corrected_y_values=fit_gaussians_obj.unfitted_y_values,
                uncorrected_y_intercept=fit_gaussians_obj_uncorrected.intercept_y_values[0],
                corrected_y_intercept=fit_gaussians_obj.intercept_y_values[0])
        """

        uncorrected_y_values=[y-uncorrected_y_intercept for y in uncorrected_y_values]
        corrected_y_values=[y-corrected_y_intercept for y in corrected_y_values]


        plt.plot(x_values,uncorrected_y_values, color="firebrick",label="Uncorrected, aggregated coverage",
                 linewidth=0.5,alpha=0.5,marker="x")
        plt.plot(x_values, corrected_y_values,color="navy", label="Corrected, aggregated coverage", alpha=0.5,
                 marker="x")

        plt.xlabel("Distance from center of RoI [bp]")
        plt.ylabel("Coverage, normalized by intercept")
        plt.title(f"Corrected vs uncorrected coverage for sample {self.samplename}")
        plt.legend(fontsize=8)

        plt.locator_params(axis='y', nbins=5)
        plt.locator_params(axis='x', nbins=5)

        if filename=="auto":
            plt.gcf().savefig(f"{self.out_dir}/corrected_vs_uncorrected_coverage.pdf")
        elif filename is not None:
            plt.gcf().savefig(f"{self.out_dir}/{filename}")
        if return_figure:
            return plt.gcf()
        plt.close()