Source code for liquorice.utils.BiasFactorWeights

import numpy as np
from Bio import Seq
from typing import Dict,List,Tuple

[docs]def get_GC_weights_binsize1(fragments: List[int]) -> List[float]: """ For a given fragment size distribution, returns a list corresponding to the GC weight vector for a bin of size 1 :param fragments: A list of fragment lengths that is representative of the sample's global fragment length distribution :return: A list corresponding to the GC weight vector for a bin of size 1. Its length is `max(fragments)*2`, the weight corresponding to the bin itself is at index `max(fragments)`. Values represent influence of a nucleotide at a given position on the bin's coverage. """ longest_fraglen=max(fragments) weights_GC_binsize1=[] for pos in range(-longest_fraglen,longest_fraglen): weight=sum([fraglen-abs(pos) for fraglen in fragments if fraglen>=abs(pos)]) weights_GC_binsize1.append(weight) return weights_GC_binsize1
[docs]def get_GC_weights_binwide(binsize: int, fragments: List[int], dont_slide: bool = False) -> List[float]: """ For a given binsize, calculate the influence of a nucleotide at a given position on the bin's coverage :param binsize: Size of the bin :param fragments: A list of fragment lengths that is representative of the samples global fragment length distribution :param dont_slide: Set to False to return weights of 1 in the bin area and 0 outside of it :return: A list of length `(2*max(fragments)+binsize)`, the weight corresponding to the first position within the bin itself is at index `max(fragments)`. Values represent influence of a nucleotide at a given position on the bin's coverage. """ weights_GC_binsize1=get_GC_weights_binsize1(fragments) longest_fraglen=max(fragments) # needs to be binsize longer than the weights if center is just a single position wide GC_weights_for_bin=[ [] for i in range (len(weights_GC_binsize1)+binsize)] # run through all postitions ("centers") within the bin for center in range(binsize): # the leftmost center just appends its weights to the list, the next appends them shifted by 1 pos and so on for pos in range(len(weights_GC_binsize1)): GC_weights_for_bin[center+pos].append(weights_GC_binsize1[pos]) # to make the calculation of the mean meaningful, zeros need to be appended at positions where a center is not reached # by any fragment # do this for the positions that are too far downstream for pos in range(center): GC_weights_for_bin[pos].append(0) # and too far upstream for pos in range(binsize-center): GC_weights_for_bin[len(GC_weights_for_bin)-1-pos].append(0) GC_weights_for_bin=[np.mean(x) for x in GC_weights_for_bin] if not dont_slide: return GC_weights_for_bin else: return [0 for i in range(longest_fraglen)]+[1 for i in range(binsize)]+[0 for i in range(longest_fraglen)]
[docs]def get_dinuc_weights_binwide(GC_weights: List[float]) -> List[float]: """ For a given binsize, calculate the influence of a given dinucleotide starting at a given position on the bin's coverage. This simply averages the GC weight of the position of interest and the following position. For the last position, a weight of 0 is returned. :param GC_weights: result of :func:`get_GC_weights_binwide` for the appropriate binsize and fragments of interest. :return: A list of length `(2*max(fragments)+binsize)`, the weight corresponding to the first position within the bin itself is at index `max(fragments)`. Values represent influence of a dinucleotide starting at a given position on the bin's coverage. """ return [(GC_weights[i] + GC_weights[i + 1]) / 2 for i in range(len(GC_weights) - 1)] + [0]
[docs]def get_trinuc_weights_binwide(GC_weights: List[float]) -> List[float]: """ For a given binsize, calculate the influence of a given trinucleotide starting at a given position on the bin's coverage. This simply averages the GC weight of the position of interest and the following 2 positions. For the last two positions, a weight of 0 is returned. :param GC_weights: result of get_GC_weights_binwide for the binsize and fragments of interest. :return: A list of length `(2*max(fragments)+binsize)`, the weight corresponding to the first position within the bin itself is at index `max(fragments)`. Values represent influence of a trinucleotide starting at a given position on the bin's coverage. """ return [(GC_weights[i] + GC_weights[i + 1] + GC_weights[i + 2]) / 3 for i in range(len(GC_weights) - 2)] + [0,0]
[docs]def get_mapp_weights_binwide(binsize: int, fragments: List[int], dont_slide: bool = False) -> Tuple[List[float]]: """ For a given binsize, calculate the influence a fragment starting/ending at a given position on the bin's coverage :param binsize: Size of the bin :param fragments: A list of fragment lengths that is representative of the samples global fragment length distribution :param dont_slide: Set to False to return weights of 1 in the bin area and 0 outside of it :return: Two lists of length `(2*max(fragments)+binsize)`. Values represent influence a fragment starting/ending at a given position on the bin's coverage. First list is for forward mappability (fragments starting at the position), second list is for reverse mappability (fragments ending at the position) """ longest_frag=max(fragments) weights_fwd={key:0 for key in range(-longest_frag,longest_frag+binsize)} weights_rev={key:0 for key in range(-longest_frag,longest_frag+binsize)} for pos in range(-longest_frag,binsize): first_binpos=0 last_binpos=binsize-1 for fraglen in fragments: if pos <=first_binpos: #overlap of the fragment with the bin weight=max(min(binsize,fraglen-abs(pos)),0) weights_fwd[pos]+=weight weights_rev[pos+fraglen-1]+=weight if pos > first_binpos and pos <= last_binpos: weight=max(min(binsize-abs(pos), fraglen),0) weights_fwd[pos]+=weight weights_rev[pos+fraglen-1]+=weight if not dont_slide: return list(weights_fwd.values()),list(weights_rev.values()) else: return [0 for i in range(longest_frag)]+[1 for i in range(binsize)]+[0 for i in range(longest_frag)], \ [0 for i in range(longest_frag)]+[1 for i in range(binsize)]+[0 for i in range(longest_frag)]
[docs]def get_nucleotide_dicts() -> Tuple[Dict[str,int]]: """ Prepare dictionaries with di-and trinucleotides, as well as a dictionary for forward/reverse complement translation. :return: Three dictionaries: All dinucleotides excluding reverse complements, all trinucleotides excluding reverse complements, and a dictonary that allows translation of reverse complements to their forward complement counterparts (in this order). Keys are all 0. """ # dictionary of all di- and trinucleotides: dinucdict={} trinucdict={} nucdict={} nucleotides=["A","T","G","C"] for nuc1 in nucleotides: for nuc2 in nucleotides: dinuc=nuc1+nuc2 dinuc_seq=Seq.Seq(dinuc) if not str(dinuc_seq.reverse_complement()) in dinucdict.keys(): dinucdict[dinuc]=0 nucdict[dinuc]=0 for nuc1 in nucleotides: for nuc2 in nucleotides: for nuc3 in nucleotides: trinuc=nuc1+nuc2+nuc3 trinuc_seq=Seq.Seq(trinuc) if not str(trinuc_seq.reverse_complement()) in trinucdict.keys(): trinucdict[trinuc]=0 nucdict[trinuc]=0 revcompdict={key:str(Seq.Seq(key).reverse_complement()) for key in list(nucdict.keys())} return dinucdict,trinucdict,revcompdict