Source code for phe.utils

from collections import OrderedDict
import logging
from math import floor
import math
import multiprocessing
import os
from pprint import pprint
import sys

from bintrees import FastRBTree
import vcf

HAVE_SCIPY = True
try:
    from scipy.stats import binom_test
except ImportError:
    HAVE_SCIPY = False

HAVE_PSUTIL = True
try:
    from psutil import virtual_memory
except ImportError:
    HAVE_PSUTIL = False

dValChars = {'A': 1, 'C': 1, 'G': 1, 'T': 1}

# --------------------------------------------------------------------------------------------------

# G -> A = transition = (1, 0)
# A -> T = transversion = (0, 1)
dK80 = {'A': {'A': [0.0, 0.0], 'C': [0.0, 1.0], 'G': [1.0, 0.0], 'T': [0.0, 1.0]},
        'C': {'A': [0.0, 1.0], 'C': [0.0, 0.0], 'G': [0.0, 1.0], 'T': [1.0, 0.0]},
        'G': {'A': [1.0, 0.0], 'C': [0.0, 1.0], 'G': [0.0, 0.0], 'T': [0.0, 1.0]},
        'T': {'A': [0.0, 1.0], 'C': [1.0, 0.0], 'G': [0.0, 1.0], 'T': [0.0, 0.0]}}

# --------------------------------------------------------------------------------------------------

[docs]class BaseStats(object): def __init__(self): self.N = 0 self.mut = 0 self.gap = 0 self.mix = 0 self.total = 0 def __str__(self): return "N: %i, mut: %i, mix: %i, gap: %i, total: %i" % (self.N, self.mut, self.mix, self.gap, self.total) def __add__(self, other): self.mut += other.mut self.N += other.N self.gap += other.gap self.mix += other.mix self.total += other.total self.NA += other.NA return self
[docs] def update(self, position_data, sample, reference): for k, v in position_data.iteritems(): if k != sample: continue if v == "-": self.gap += 1 elif v == "N": self.N += 1 elif v in ["A", "C", "G", "T"]: self.mut += 1 else: self.mix += 1 self.total += 1
# --------------------------------------------------------------------------------------------------
[docs]def is_uncallable(record): """Is the Record uncallable? Currently the record is **uncallable** iff: * GT field is **./.** * **LowQual** is in the filter. Returns ------- uncall: bool True if any of the above items are true, False otherwise. """ uncall = False try: if record.samples[0].data.GT in ("./.", None): uncall = True except AttributeError: uncall = None if record.FILTER is not None and "LowQual" in record.FILTER: uncall = True return uncall
# --------------------------------------------------------------------------------------------------
[docs]def precompute_snp_densities(avail_pos, sample_names, args): """ Precompute the number of differences around each difference between each pair of samples Parameters ---------- avail_pos: dict data structure that contains the information on all available positions, like this: {'gi|194097589|ref|NC_011035.1|': FastRBTree({2329: {'stats': <vcf2distancematrix.BaseStats object at 0x40fb590>, 'reference': 'A', '211700_H15498026501': 'C', '211701_H15510030401': 'C', '211702_H15522021601': 'C'}, 3837: {'211700_H15498026501': 'G', 'stats': <vcf2distancematrix.BaseStats object at 0x40fbf90>, '211701_H15510030401': 'G', 'reference': 'T', '211702_H15522021601': 'G'}, 4140: {'211700_H15498026501': 'A', 'stats': <vcf2distancematrix.BaseStats object at 0x40fb790>, '211701_H15510030401': 'A', 'reference': 'G', '211702_H15522021601': 'A'}})} sample_names: list list of sample names args: dict input parameter dictionary as created by get_args() Returns ------- dDen: dict contains the differences between a pair in a window of given size around each difference of the pair {'diffs': {'187534_H153520399-1': {'187534_H153520399-1': 0, '187536_H154060132-1': 1609, '189918_H154320283-2': 295, '205683_H15352039901': 0, '211698_H15464036401': 298, '211700_H15498026501': 298, '211701_H15510030401': 1621, '211702_H15522021601': 297, '211703_H15534021301': 1632, 'reference': 4045}, '187536_H154060132-1': {'187536_H154060132-1': 0, '205683_H15352039901': 1605, '211698_H15464036401': 1353, '211701_H15510030401': 1, '211702_H15522021601': 1351, '211703_H15534021301': 7, 'reference': 5041} ..., }, 'gi|194097589|ref|NC_011035.1|': {'187534_H153520399-1': {'187536_H154060132-1': {55959: 1, 56617: 1, 157165: 1, 279950: 3, 279957: 3, 279959: 3, 608494: 22, 608537: 23, 608551: 23, 608604: 23, 608617: 24, ...,} '189918_H154320283-2': {27696: 1, 55959: 1, 56617: 2, 56695: 2, 279950: 3, 279957: 3, 279959: 3, 520610: 1, 608494: 22, ..., }}}} """ (_, flGenLen) = get_ref_freqs(args['refgenome'], len_only=True) dDen = {} dDen['diffs'] = {} for i, sample_1 in enumerate(sample_names): dDen['diffs'][sample_1] = {} for j, sample_2 in enumerate(sample_names): if j <= i: dDen['diffs'][sample_1][sample_2] = 0 pool = multiprocessing.Pool(args['threads']) parameters = [] for contig, oBT in avail_pos.items(): dDen[contig] = {} for i, sample_1 in enumerate(sample_names): dDen[contig][sample_1] = {} for j, sample_2 in enumerate(sample_names): if j < i: parameters.append((sample_1, sample_2, oBT, flGenLen,)) results = pool.map(_get_sample_pair_densities, parameters) # Close the pool and wait for all tasks to be completed. pool.close() pool.join() for contig, oBT in avail_pos.items(): for i, sample_1 in enumerate(sample_names): for j, sample_2 in enumerate(sample_names): if j < i: (x, y) = results.pop(0) dDen['diffs'][sample_1][sample_2] += x dDen[contig][sample_1][sample_2] = y # debug # sOutBase = os.path.splitext(args['out'])[0] # with open('%s_dDen.txt' % (sOutBase), 'w') as f: # pprint(dDen, f) return dDen
# -------------------------------------------------------------------------------------------------- def _get_sample_pair_densities(ARGS): """ This wrapper is required to call a funtion with pool.map that accepts >1 parameter. (see http://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments answers by J.F. Sebastian and imotai) Parameters ---------- ARGS: tuple tuple with parameters Returns ------- call to get_sample_pair_densities with parameters unpacked """ return get_sample_pair_densities(*ARGS) # --------------------------------------------------------------------------------------------------
[docs]def get_sample_pair_densities(sample_1, sample_2, oBT, flGenLen): ''' Function to calculate the differecnes in a window of a given size around is difference for a given pair Parameters ---------- sample_1: str name of sample 1 sample_2: str name of sample 2 oBT: obj bintree object that contains all information for all available positions for a given contig flGenLen: float reference genome length Returns ------- (diffs, d): tuple diffs: int total number of differences between the pair d: dict dict with position of difference as key and number of differences in window around it as value ''' # first find out the total number of diff between the two samples diffs = 0 for pos in oBT: ref_base = oBT[pos].get("reference") s1_base = oBT[pos].get(sample_1, ref_base) # consider only differences between valid characters -> pairwise deletion if dValChars.get(s1_base.upper(), None) == None: continue s2_base = oBT[pos].get(sample_2, ref_base) # consider only differences between valid characters -> pairwise deletion if dValChars.get(s2_base.upper(), None) == None: continue if s1_base != s2_base: diffs += 1 d = {} # use win size of avg nof nucleotides per difference, i.e. each window should have one diff on average # for very similar sample pairs (very few diffs) do not impose max win size # if >1% of genome is different use win size 100 try: iWinsize = max([int(round(flGenLen/diffs)), 100]) except ZeroDivisionError: return (diffs, d) for pos in oBT: ref_base = oBT[pos].get("reference") s1_base = oBT[pos].get(sample_1, ref_base) # consider only differences between valid characters -> pairwise deletion if dValChars.get(s1_base.upper(), None) == None: continue s2_base = oBT[pos].get(sample_2, ref_base) # consider only differences between valid characters -> pairwise deletion if dValChars.get(s2_base.upper(), None) == None: continue if s1_base != s2_base: iDiffsInWin = 0 iWinStart = max(0, pos - ((iWinsize / 2) - 1)) iWinStop = min(int(flGenLen), (pos + (iWinsize / 2) + 1)) for x in range(iWinStart, iWinStop): try: winbase_1 = 'ref' if sample_1 == 'reference' else oBT[x].get(sample_1, 'ref') winbase_2 = 'ref' if sample_2 == 'reference' else oBT[x].get(sample_2, 'ref') except KeyError: # x is a point in the alignment where everything is ref continue # now x is a point in the alignment where there is at least one SNP but not necessarity in s1 or s2 if winbase_1 != 'ref': # if not ref check for valid char if dValChars.get(winbase_1.upper(), None) == None: continue if winbase_2 != 'ref': # if not ref check for valid char if dValChars.get(winbase_2.upper(), None) == None: continue if winbase_1 != winbase_2: iDiffsInWin += 1 d[pos] = iDiffsInWin return (diffs, d)
# --------------------------------------------------------------------------------------------------
[docs]def parse_wg_alignment(dArgs, avail_pos, aSampleNames): ''' Parse alignment to data structure Parameters ---------- dArgs: dict input parameter dictionary as created by get_args() avail_pos: dict dict of bintrees for each contig aSampleNames: list list of sample names Returns ------- 0 also writes all data to avail_pos ''' """ avail_pos looks like this: {'gi|194097589|ref|NC_011035.1|': FastRBTree({2329: {'stats': <vcf2distancematrix.base_stats object at 0x40fb590>, 'reference': 'A', '211700_H15498026501': 'C', '211701_H15510030401': 'C', '211702_H15522021601': 'C'}, 3837: {'211700_H15498026501': 'G', 'stats': <vcf2distancematrix.base_stats object at 0x40fbf90>, '211701_H15510030401': 'G', 'reference': 'T', '211702_H15522021601': 'G'}, 4140: {'211700_H15498026501': 'A', 'stats': <vcf2distancematrix.base_stats object at 0x40fb790>, '211701_H15510030401': 'A', 'reference': 'G', '211702_H15522021601': 'A'}})} """ if os.path.exists(dArgs['alignment_input']) == False: logging.error("Input alignment file not found.") return 1 dSeqs = {} with open(dArgs['alignment_input'], 'r') as algn: # parse the fa file sSeq = "" sName = "" for sLine in algn: sLine = sLine.strip() if sLine.startswith(">"): if len(sSeq) > 0: dSeqs[sName] = sSeq.upper() sSeq = "" sName = sLine[1:].split(' ')[0] continue sSeq = sSeq + sLine dSeqs[sName] = sSeq.upper() sRefName = 'reference' if dSeqs.has_key(sRefName) == False: if dArgs['refgenomename'] is not None and dSeqs.has_key(dArgs['refgenomename']) == True: sRefName = dArgs['refgenomename'] else: logging.error("No seq named 'reference' in your alignment AND alternative name not found or not given either.") return 1 else: logging.info("Your reference appears to be named 'reference'") for sn in dSeqs.keys(): aSampleNames.append(sn) all_seq_len = [len(x) for x in dSeqs.values()] try: assert len(all_seq_len) == all_seq_len.count(all_seq_len[0]) except AssertionError: logging.error("Not all seqs in your alignment have the same length") return 1 avail_pos['alignment_contig'] = FastRBTree() iSeqLen = all_seq_len[0] for i in range(0, iSeqLen): d = {} d['reference'] = dSeqs[sRefName][i] for sn in aSampleNames: if sn != sRefName and dSeqs[sn][i] != d['reference']: d[sn] = dSeqs[sn][i] # are the nt at this pos all the same? if len(d.values()) == d.values().count(d.values()[0]): continue oBS = BaseStats() for sn in aSampleNames: oBS.update(d, sn, '_') d['stats'] = oBS avail_pos['alignment_contig'][i+1] = d return 0
# --------------------------------------------------------------------------------------------------
[docs]def parse_vcf_files(dArgs, avail_pos, aSampleNames): ''' Parse vcf files to data structure Parameters ---------- dArgs: dict input parameter dictionary as created by get_args() avail_pos: dict dict of bintrees for each contig aSampleNames: list list of sample names Returns ------- 0 also writes all data to avail_pos ''' empty_tree = FastRBTree() exclude = {} include = {} if dArgs["exclude"] != None or dArgs["include"] != None: pos = {} chr_pos = [] bed_file = dArgs["include"] if dArgs["include"] != None else dArgs["exclude"] with open(bed_file) as fp: for line in fp: data = line.strip().split("\t") chr_pos += [(i, False,) for i in xrange(int(data[1]), int(data[2]) + 1)] try: pos[data[0]] += chr_pos except KeyError: pos[data[0]] = chr_pos pos = {chrom: FastRBTree(l) for chrom, l in pos.items()} if dArgs["include"]: include = pos else: exclude = pos dSamNms = {'reference': None} # First pass to get the references and the positions to be analysed. for vcf_in in dArgs['input']: reader = vcf.Reader(filename=vcf_in) # Get the sample name from the VCF file (usually the read group). sample_name = reader.samples[0] assert dSamNms.has_key(sample_name) == False, "ERROR: %s is not a unique sample name in the set of vcfs" % (sample_name) dSamNms[sample_name] = None # Go over every position in the reader. for record in reader: # Inject a property about uncallable genotypes into record. record.__setattr__("is_uncallable", is_uncallable(record)) # is_uncallable = types.MethodType(is_uncallable, record) # SKIP indels, if not handled then can cause REF base to be >1 if record.is_indel and not (record.is_uncallable or record.is_monomorphic) or len(record.REF) > 1: # if len(record.REF) > 1: # print "%s\t%s\t%s\t%s\t%s" % (sample_name,record.POS,-1,record.FILTER,record) continue # if record.is_deletion and not record.is_uncallable: # continue # SKIP (or include) any pre-specified regions. if include and record.POS not in include.get(record.CHROM, empty_tree) or exclude and record.POS in exclude.get(record.CHROM, empty_tree): # if include.get(record.CHROM, empty_tree).get(record.POS, False) or \ # not exclude.get(record.CHROM, empty_tree).get(record.POS, True): continue try: _ = avail_pos[record.CHROM] except KeyError: avail_pos[record.CHROM] = FastRBTree() # Setup the position data to contain reference and stats. if avail_pos[record.CHROM].get(record.POS, None) is None: avail_pos[record.CHROM].insert(record.POS, {"reference": str(record.REF), "stats": BaseStats()}) position_data = avail_pos[record.CHROM].get(record.POS) assert len(position_data["reference"]) == 1, "Reference base must be singluar: in %s found %s @ %s" % (position_data["reference"], sample_name, record.POS) # IF this is uncallable genotype, add gap "-" if record.is_uncallable: # TODO: Mentioned in issue: #7(gitlab) position_data[sample_name] = "-" # Update stats position_data["stats"].gap += 1 elif not record.FILTER: # If filter PASSED! # Make sure the reference base is the same. Maybe a vcf from different species snuck in here?! assert str(record.REF) == position_data["reference"] or str(record.REF) == 'N' or position_data["reference"] == 'N', "SOMETHING IS REALLY WRONG because reference for the same position is DIFFERENT! %s in %s (%s, %s)" % (record.POS, vcf_in, str(record.REF), position_data["reference"]) # update position_data['reference'] to a real base if possible if position_data['reference'] == 'N' and str(record.REF) != 'N': position_data['reference'] = str(record.REF).upper() if record.is_snp: if len(record.ALT) > 1: logging.info("POS %s passed filters but has multiple alleles REF: %s, ALT: %s. Inserting N", record.POS, str(record.REF), str(record.ALT)) position_data[sample_name] = "N" position_data["stats"].N += 1 else: position_data[sample_name] = str(record.ALT[0]).upper() position_data["stats"].mut += 1 # Filter(s) failed elif record.is_snp: position_data[sample_name] = 'N' position_data["stats"].N += 1 else: # filter fail; code as N for consistency position_data[sample_name] = "N" position_data["stats"].N += 1 # finished parsing for sn in dSamNms.keys(): aSampleNames.append(sn) return 0
# --------------------------------------------------------------------------------------------------
[docs]def get_dist_mat(aSampleNames, avail_pos, dArgs): """ Calculates the distance matrix, optionally removes recombination from it and optionally normalises it Parameters ---------- aSampleNames: list list of sample names avail_pos: dict infomatin on all available positions {'gi|194097589|ref|NC_011035.1|': FastRBTree({2329: {'stats': <vcf2distancematrix.BaseStats object at 0x40fb590>, 'reference': 'A', '211700_H15498026501': 'C', '211701_H15510030401': 'C', '211702_H15522021601': 'C'}, 3837: {'211700_H15498026501': 'G', 'stats': <vcf2distancematrix.BaseStats object at 0x40fbf90>, '211701_H15510030401': 'G', 'reference': 'T', '211702_H15522021601': 'G'}, 4140: {'211700_H15498026501': 'A', 'stats': <vcf2distancematrix.BaseStats object at 0x40fb790>, '211701_H15510030401': 'A', 'reference': 'G', '211702_H15522021601': 'A'}})} dArgs: dict input parameter dictionary as created by get_args() Returns ------- call to get_sample_pair_densities with parameters unpacked """ dDen = None dRemovals = None if dArgs['remove_recombination'] == True: if HAVE_SCIPY == False: logging.error("Cannot import scipy requied for recombination removal.") return None dDen = precompute_snp_densities(avail_pos, aSampleNames, dArgs) dRemovals = {} for i, sample_1 in enumerate(aSampleNames): dRemovals[sample_1] = {} for j, sample_2 in enumerate(aSampleNames): if j <= i: dRemovals[sample_1][sample_2] = 0 # initialise empty matrix dist_mat = {} for i, sample_1 in enumerate(aSampleNames): dist_mat[sample_1] = {} for j, sample_2 in enumerate(aSampleNames): if j <= i: if dArgs['substitution'] == 'k80' or dArgs['substitution'] == 't93': dist_mat[sample_1][sample_2] = [0.0, 0.0] elif dArgs['substitution'] == 'tn84': dist_mat[sample_1][sample_2] = {'A': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0}, 'C': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0}, 'G': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0}, 'T': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0}} else: dist_mat[sample_1][sample_2] = 0.0 else: # j > i pass flGenLen = 0.0 if dDen != None: (_, flGenLen) = get_ref_freqs(dArgs['refgenome'], len_only=True) aStats = [] for sContig in avail_pos.keys(): for pos in avail_pos[sContig]: ref_base = avail_pos[sContig][pos].get("reference") for i, sample_1 in enumerate(aSampleNames): s1_base = avail_pos[sContig][pos].get(sample_1, ref_base) # consider only differences between valid characters -> pairwise deletion if dValChars.get(s1_base.upper(), None) == None: continue for j, sample_2 in enumerate(aSampleNames): if j < i: s2_base = avail_pos[sContig][pos].get(sample_2, ref_base) # consider only differences between valid characters -> pairwise deletion if dValChars.get(s2_base.upper(), None) == None: continue # Recombination removal happens here if dDen != None and s1_base != s2_base: iDiffsInWin = dDen[sContig][sample_1][sample_2][pos] iTotalDiffs = dDen['diffs'][sample_1][sample_2] flNofWins = flGenLen / max([int(round(flGenLen/iTotalDiffs)), 100]) p_hitting_window = 1.0 / flNofWins p_ok = 1.0 # only do binomial test if there are 'too many differences' # => do not exclude diffs because there are 'not enough' if iDiffsInWin > 1 and iDiffsInWin > iTotalDiffs / float(flNofWins): # binomial test: # what is the probability that I have x successes (i.e. diffs in the current window) # given n trials (i.e. the total number of differences between the two samples) # and given that the probabilty of success is 1/total_num_windows p_ok = binom_test(iDiffsInWin, iTotalDiffs, p_hitting_window) corr_p_thresh = (0.05 / iTotalDiffs) aStats.append("%s\t%s\t%i\t%i\t%i\t%e\t%e\n" % (sample_1, sample_2, pos, iDiffsInWin, iTotalDiffs, p_ok, corr_p_thresh)) # null-hypothesis: probability of hitting it is equal for all windows, i.e. # the diffs between the samples are uniformly distributed # Bonferroni corrected p-value threshold: (0.05 / iTotalDiffs) if p_ok <= corr_p_thresh: # likely to be recombinat site (for a given definition of 'likely' and 'recombinant') # aStats.append("%s\t%s\t%i\t%i\t%i\t%e\n" % (sample_1, sample_2, pos, iDiffsInWin, iTotalDiffs, p_ok)) dRemovals[sample_1][sample_2] += 1 continue if dArgs['substitution'] == 'k80' or dArgs['substitution'] == 't93': k = get_difference_value(s1_base.upper(), s2_base.upper(), dArgs['substitution']) dist_mat[sample_1][sample_2][0] += k[0] dist_mat[sample_1][sample_2][1] += k[1] elif dArgs['substitution'] == 'tn84': if s1_base > s2_base: # don't do this: # s1_base, s2_base = s2_base, s1_base # dist_mat[sample_1][sample_2][s1_base][s2_base] += 1.0 # => messes up the references dist_mat[sample_1][sample_2][s2_base][s1_base] += 1.0 else: dist_mat[sample_1][sample_2][s1_base][s2_base] += 1.0 elif dArgs['substitution'] == 'number_of_differences' or dArgs['substitution'] == 'jc69': k = get_difference_value(s1_base.upper(), s2_base.upper(), dArgs['substitution']) dist_mat[sample_1][sample_2] += k else: raise NotImplementedError else: # j >= i pass # write additional stats if required if dArgs['remove_recombination'] == True and dArgs['with_stats'] == True: sOutBase = os.path.splitext(dArgs['out'])[0] with open("%s.removals.tsv" % (sOutBase), 'w') as fOut: for i, sample_1 in enumerate(aSampleNames): row = sample_1 for j, sample_2 in enumerate(aSampleNames): if j < i: row += "%s%i" % ('\t', dRemovals[sample_1][sample_2]) fOut.write("%s\n" % row) with open("%s.proportion_removed.tsv" % (sOutBase), 'w') as fOut: for i, sample_1 in enumerate(aSampleNames): row = sample_1 for j, sample_2 in enumerate(aSampleNames): if j < i: try: row += "%s%f" % ('\t', dRemovals[sample_1][sample_2] \ / (dist_mat[sample_1][sample_2] \ + dRemovals[sample_1][sample_2])) except ZeroDivisionError: row += "\tNAN" fOut.write("%s\n" % row) with open("%s.all_snps.tsv" % (sOutBase), 'w') as fOut: fOut.write("sample_1\tsample_2\tposition\tdiffs_in_win\ttotal_diffs\tp_ok\tthreshold\n") for sLine in aStats: fOut.write(sLine) # 'normalise' distance matrix according to model requested if dArgs['substitution'] == 'jc69': dist_mat = normalise_jc69(dist_mat, dArgs['refgenome'], aSampleNames) elif dArgs['substitution'] == 'k80': dist_mat = normalise_k80(dist_mat, dArgs['refgenome'], aSampleNames) elif dArgs['substitution'] == 'tn84': dist_mat = normalise_tn84(dist_mat, dArgs['refgenome'], aSampleNames) elif dArgs['substitution'] == 't93': dist_mat = normalise_t93(dist_mat, dArgs['refgenome'], aSampleNames) elif dArgs['substitution'] == 'number_of_differences': pass else: raise NotImplementedError return dist_mat
# --------------------------------------------------------------------------------------------------
[docs]def normalise_t93(d, ref, names): ''' Normalise distance matrix according to the Tamura 3-parameter distance model see: Nei and Zhang: Evolutionary Distance: Estimation, ENCYCLOPEDIA OF LIFE SCIENCES 2005, doi: 10.1038/npg.els.0005108 http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 16 Parameters ---------- d: dict distance matrix ref: str reference genome file name names: list list of sample names Returns ------- d: dict normalised matrix ''' (dRefFreq, flGenLen) = get_ref_freqs(ref) flGC = dRefFreq['C'] + dRefFreq['G'] h = 2.0 * flGC * (1.0 - flGC) for i, sample_1 in enumerate(names): for j, sample_2 in enumerate(names): if j < i: Q = d[sample_1][sample_2][1] / flGenLen p = sum(d[sample_1][sample_2]) / flGenLen x1 = -h * math.log(1.0 - p / h - Q) x2 = 0.5 * (1.0 - h) * math.log(1.0 - (2.0 * Q)) d[sample_1][sample_2] = x1 - x2 elif i == j: d[sample_1][sample_2] = 0.0 return d
# --------------------------------------------------------------------------------------------------
[docs]def normalise_tn84(d, ref, names): """ Normalise distance matrix according to the Kimura 2-parameter distance model see: Nei and Zhang: Evolutionary Distance: Estimation, ENCYCLOPEDIA OF LIFE SCIENCES 2005, doi: 10.1038/npg.els.0005108 http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 13 and 14 Parameters ---------- d: dict distance matrix d = {'211701_H15510030401': {'211700_H15498026501': {'A': {'A': 1152.0, 'C': 114.0, 'G': 545.0, 'T': 35.0}, 'C': {'A': 0.0, 'C': 1233.0, 'G': 108.0, 'T': 467.0}, 'G': {'A': 0.0, 'C': 0.0, 'G': 1283.0, 'T': 100.0}, 'T': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 1177.0}}, ...}, ...} ref: str reference genome file name names: list list of sample names Returns ------- d: dict normalised matrix """ (dRefFreq, flGenLen) = get_ref_freqs(ref) sum1 = sum([x * x for x in dRefFreq.values()]) for i, sample_1 in enumerate(names): for j, sample_2 in enumerate(names): if j < i: iNofDiff = getTotalNofDiff_tn84(d[sample_1][sample_2]) p = iNofDiff / flGenLen sum2 = 0.0 for m, nuc1 in enumerate(['A', 'C', 'G', 'T']): for n, nuc2 in enumerate(['A', 'C', 'G', 'T']): if m < n: flFreqNucPair = d[sample_1][sample_2][nuc1][nuc2] / flGenLen sum2 += ((flFreqNucPair ** 2.0) / (2.0 * dRefFreq[nuc1] * dRefFreq[nuc2])) if sum2 == 0.0: # samples are "identical" d[sample_1][sample_2] = 0.0 else: b = 0.5 * ((1.0 - sum1) + (p * p / sum2)) d[sample_1][sample_2] = -b * math.log(1.0 - (p / b)) elif i == j: d[sample_1][sample_2] = 0.0 else: # j > i pass return d
# --------------------------------------------------------------------------------------------------
[docs]def normalise_k80(d, ref, names): ''' Normalise distance matrix according to the Tajima-Nei distance model see: Nei and Zhang: Evolutionary Distance: Estimation, ENCYCLOPEDIA OF LIFE SCIENCES 2005, doi: 10.1038/npg.els.0005108 http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 9 Parameters ---------- d: dict distance matrix ref: str reference genome file name names: list list of sample names Returns ------- d: dict normalised matrix ''' (_, flGenLen) = get_ref_freqs(ref, len_only=True) for i, sample_1 in enumerate(names): for j, sample_2 in enumerate(names): if j < i: P = d[sample_1][sample_2][0] / flGenLen Q = d[sample_1][sample_2][1] / flGenLen w1 = 1.0 - (2.0 * P) - Q w2 = 1.0 - (2.0 * Q) d[sample_1][sample_2] = (-0.5 * math.log(w1)) - (0.25 * math.log(w2)) elif i == j: d[sample_1][sample_2] = 0.0 return d
# --------------------------------------------------------------------------------------------------
[docs]def normalise_jc69(d, ref, names): ''' Normalise distance matrix according to the Jukes-Cantor distance model see: Nei and Zhang: Evolutionary Distance: Estimation, ENCYCLOPEDIA OF LIFE SCIENCES 2005, doi: 10.1038/npg.els.0005108 http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 7 Parameters ---------- d: dict distance matrix ref: str reference genome file name names: list list of sample names Returns ------- d: dict normalised matrix ''' flGenLen = 0.0 with open(ref, 'r') as fRef: for sLine in fRef: if sLine.startswith(">"): continue else: flGenLen += len(sLine.strip()) logging.info("genome length: %s" % flGenLen) for i, sample_1 in enumerate(names): for j, sample_2 in enumerate(names): if j < i: p = d[sample_1][sample_2] / flGenLen d[sample_1][sample_2] = (-3.0 / 4.0) * math.log(1.0 - ((4.0 / 3.0) * p)) return d
# --------------------------------------------------------------------------------------------------
[docs]def get_difference_value(s1_base, s2_base, sSubs): ''' Get difference value for a given set of bases. Parameters ---------- s1_base: str a charcater s2_base: str a charcater sSubs: str distance model Returns ------- difference: float or list depending on the distance model either a float 1.0 or 0.0 of a list of two floats ''' if sSubs == 'number_of_differences' or sSubs == 'jc69': try: _ = dValChars[s1_base] except KeyError: return 0.0 try: _ = dValChars[s2_base] except KeyError: return 0.0 if s1_base == s2_base: return 0.0 else: return 1.0 elif sSubs == 'k80' or sSubs == 't93': try: _ = dValChars[s1_base] except KeyError: return [0.0, 0.0] try: _ = dValChars[s2_base] except KeyError: return [0.0, 0.0] return dK80[s1_base][s2_base] else: raise NotImplementedError
# --------------------------------------------------------------------------------------------------
[docs]def getTotalNofDiff_tn84(d): """ Sum up total number of differences for a dict like the one in the input Parameters ---------- d: dict {'A': {'A': 1152.0, 'C': 114.0, 'G': 545.0, 'T': 35.0}, 'C': {'A': 0.0, 'C': 1233.0, 'G': 108.0, 'T': 467.0}, 'G': {'A': 0.0, 'C': 0.0, 'G': 1283.0, 'T': 100.0}, 'T': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 1177.0}} Returns ------- t: float the sum of all differences(1369.0 in above example case) """ t = 0.0 for k, v in d.items(): t += sum([y if x != k else 0.0 for x, y in v.items()]) return t
# --------------------------------------------------------------------------------------------------
[docs]def get_ref_freqs(ref, len_only=False): """ Get the length of the reference genome and optionally the nucleotide frequencies in it Parameters ---------- ref: str reference genome filename len_only: boolean get genome lengths only [default FALSE, also get nucleotide frequencies] Returns ------- (dRefFreq, flGenLen): tuple dRefFreq: dict dRefFreq = {'A': 0.25, 'C': 0.24, 'G': 0.26, 'T': 0.25} flGenLen: float genome length """ # get frequency of a, c, g, and t in ref dRefFreq = {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0} flGenLen = 0.0 with open(ref, 'r') as fRef: for sLine in fRef: if sLine.startswith(">"): continue else: sLine = sLine.strip() flGenLen += len(sLine) if len_only == True: continue sLine = sLine.upper() for n in sLine: try: dRefFreq[n] += 1.0 except KeyError: pass if len_only == False: for (i, j) in dRefFreq.items(): dRefFreq[i] = j / flGenLen logging.info("genome length: %s" % flGenLen) return (dRefFreq, flGenLen)
# --------------------------------------------------------------------------------------------------
[docs]def calculate_memory_for_sort(): """Calculate available memory for ``samtools sort`` function. If there is enough memory, no temp files are created. **Enough** is defined as at least 1G per CPU. Returns ------- sort_memory: str or None String to use directly with *-m* option in sort, or None. """ avail_memory = None if HAVE_PSUTIL == True: mem = virtual_memory() avail_memory = mem.total else: try: avail_memory = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') # e.g. 4015976448 except ValueError: logging.error("If you're in Mac OS you need to have the psutil Python library.") raise SystemExit avail_cpu = multiprocessing.cpu_count() sort_memory = avail_memory / avail_cpu / 1024 ** 2 # samtools default from documentation is 768M, to be conservative # only use -m option when there is more than 1G per CPU. if sort_memory < 1024: sort_memory = None else: sort_memory = "%sG" % (int(floor(sort_memory / 1024))) return sort_memory
# -------------------------------------------------------------------------------------------------- if __name__ == "__main__": print calculate_memory_for_sort()