from collections import OrderedDict
import logging
from math import floor
import math
import multiprocessing
import os
from pprint import pprint
import sys
from bintrees import FastRBTree
import vcf
HAVE_SCIPY = True
try:
from scipy.stats import binom_test
except ImportError:
HAVE_SCIPY = False
HAVE_PSUTIL = True
try:
from psutil import virtual_memory
except ImportError:
HAVE_PSUTIL = False
dValChars = {'A': 1, 'C': 1, 'G': 1, 'T': 1}
# --------------------------------------------------------------------------------------------------
# G -> A = transition = (1, 0)
# A -> T = transversion = (0, 1)
dK80 = {'A': {'A': [0.0, 0.0], 'C': [0.0, 1.0], 'G': [1.0, 0.0], 'T': [0.0, 1.0]},
'C': {'A': [0.0, 1.0], 'C': [0.0, 0.0], 'G': [0.0, 1.0], 'T': [1.0, 0.0]},
'G': {'A': [1.0, 0.0], 'C': [0.0, 1.0], 'G': [0.0, 0.0], 'T': [0.0, 1.0]},
'T': {'A': [0.0, 1.0], 'C': [1.0, 0.0], 'G': [0.0, 1.0], 'T': [0.0, 0.0]}}
# --------------------------------------------------------------------------------------------------
[docs]class BaseStats(object):
def __init__(self):
self.N = 0
self.mut = 0
self.gap = 0
self.mix = 0
self.total = 0
def __str__(self):
return "N: %i, mut: %i, mix: %i, gap: %i, total: %i" % (self.N,
self.mut,
self.mix,
self.gap,
self.total)
def __add__(self, other):
self.mut += other.mut
self.N += other.N
self.gap += other.gap
self.mix += other.mix
self.total += other.total
self.NA += other.NA
return self
[docs] def update(self, position_data, sample, reference):
for k, v in position_data.iteritems():
if k != sample:
continue
if v == "-":
self.gap += 1
elif v == "N":
self.N += 1
elif v in ["A", "C", "G", "T"]:
self.mut += 1
else:
self.mix += 1
self.total += 1
# --------------------------------------------------------------------------------------------------
[docs]def is_uncallable(record):
"""Is the Record uncallable? Currently the record is **uncallable** iff:
* GT field is **./.**
* **LowQual** is in the filter.
Returns
-------
uncall: bool
True if any of the above items are true, False otherwise.
"""
uncall = False
try:
if record.samples[0].data.GT in ("./.", None):
uncall = True
except AttributeError:
uncall = None
if record.FILTER is not None and "LowQual" in record.FILTER:
uncall = True
return uncall
# --------------------------------------------------------------------------------------------------
[docs]def precompute_snp_densities(avail_pos, sample_names, args):
"""
Precompute the number of differences around each difference between each pair of samples
Parameters
----------
avail_pos: dict
data structure that contains the information on all available positions, like this:
{'gi|194097589|ref|NC_011035.1|':
FastRBTree({2329: {'stats': <vcf2distancematrix.BaseStats object at 0x40fb590>,
'reference': 'A',
'211700_H15498026501': 'C',
'211701_H15510030401': 'C',
'211702_H15522021601': 'C'},
3837: {'211700_H15498026501': 'G',
'stats': <vcf2distancematrix.BaseStats object at 0x40fbf90>,
'211701_H15510030401': 'G',
'reference': 'T',
'211702_H15522021601': 'G'},
4140: {'211700_H15498026501': 'A',
'stats': <vcf2distancematrix.BaseStats object at 0x40fb790>,
'211701_H15510030401': 'A',
'reference': 'G',
'211702_H15522021601': 'A'}})}
sample_names: list
list of sample names
args: dict
input parameter dictionary as created by get_args()
Returns
-------
dDen: dict
contains the differences between a pair in a window of given size
around each difference of the pair
{'diffs': {'187534_H153520399-1': {'187534_H153520399-1': 0,
'187536_H154060132-1': 1609,
'189918_H154320283-2': 295,
'205683_H15352039901': 0,
'211698_H15464036401': 298,
'211700_H15498026501': 298,
'211701_H15510030401': 1621,
'211702_H15522021601': 297,
'211703_H15534021301': 1632,
'reference': 4045},
'187536_H154060132-1': {'187536_H154060132-1': 0,
'205683_H15352039901': 1605,
'211698_H15464036401': 1353,
'211701_H15510030401': 1,
'211702_H15522021601': 1351,
'211703_H15534021301': 7,
'reference': 5041}
...,
},
'gi|194097589|ref|NC_011035.1|': {'187534_H153520399-1': {'187536_H154060132-1': {55959: 1,
56617: 1,
157165: 1,
279950: 3,
279957: 3,
279959: 3,
608494: 22,
608537: 23,
608551: 23,
608604: 23,
608617: 24,
...,}
'189918_H154320283-2': {27696: 1,
55959: 1,
56617: 2,
56695: 2,
279950: 3,
279957: 3,
279959: 3,
520610: 1,
608494: 22,
...,
}}}}
"""
(_, flGenLen) = get_ref_freqs(args['refgenome'], len_only=True)
dDen = {}
dDen['diffs'] = {}
for i, sample_1 in enumerate(sample_names):
dDen['diffs'][sample_1] = {}
for j, sample_2 in enumerate(sample_names):
if j <= i:
dDen['diffs'][sample_1][sample_2] = 0
pool = multiprocessing.Pool(args['threads'])
parameters = []
for contig, oBT in avail_pos.items():
dDen[contig] = {}
for i, sample_1 in enumerate(sample_names):
dDen[contig][sample_1] = {}
for j, sample_2 in enumerate(sample_names):
if j < i:
parameters.append((sample_1, sample_2, oBT, flGenLen,))
results = pool.map(_get_sample_pair_densities, parameters)
# Close the pool and wait for all tasks to be completed.
pool.close()
pool.join()
for contig, oBT in avail_pos.items():
for i, sample_1 in enumerate(sample_names):
for j, sample_2 in enumerate(sample_names):
if j < i:
(x, y) = results.pop(0)
dDen['diffs'][sample_1][sample_2] += x
dDen[contig][sample_1][sample_2] = y
# debug
# sOutBase = os.path.splitext(args['out'])[0]
# with open('%s_dDen.txt' % (sOutBase), 'w') as f:
# pprint(dDen, f)
return dDen
# --------------------------------------------------------------------------------------------------
def _get_sample_pair_densities(ARGS):
"""
This wrapper is required to call a funtion with pool.map that accepts >1 parameter.
(see http://stackoverflow.com/questions/5442910/python-multiprocessing-pool-map-for-multiple-arguments
answers by J.F. Sebastian and imotai)
Parameters
----------
ARGS: tuple
tuple with parameters
Returns
-------
call to get_sample_pair_densities with parameters unpacked
"""
return get_sample_pair_densities(*ARGS)
# --------------------------------------------------------------------------------------------------
[docs]def get_sample_pair_densities(sample_1, sample_2, oBT, flGenLen):
'''
Function to calculate the differecnes in a window of a given size around is
difference for a given pair
Parameters
----------
sample_1: str
name of sample 1
sample_2: str
name of sample 2
oBT: obj
bintree object that contains all information for all available
positions for a given contig
flGenLen: float
reference genome length
Returns
-------
(diffs, d): tuple
diffs: int
total number of differences between the pair
d: dict
dict with position of difference as key and number of differences
in window around it as value
'''
# first find out the total number of diff between the two samples
diffs = 0
for pos in oBT:
ref_base = oBT[pos].get("reference")
s1_base = oBT[pos].get(sample_1, ref_base)
# consider only differences between valid characters -> pairwise deletion
if dValChars.get(s1_base.upper(), None) == None:
continue
s2_base = oBT[pos].get(sample_2, ref_base)
# consider only differences between valid characters -> pairwise deletion
if dValChars.get(s2_base.upper(), None) == None:
continue
if s1_base != s2_base:
diffs += 1
d = {}
# use win size of avg nof nucleotides per difference, i.e. each window should have one diff on average
# for very similar sample pairs (very few diffs) do not impose max win size
# if >1% of genome is different use win size 100
try:
iWinsize = max([int(round(flGenLen/diffs)), 100])
except ZeroDivisionError:
return (diffs, d)
for pos in oBT:
ref_base = oBT[pos].get("reference")
s1_base = oBT[pos].get(sample_1, ref_base)
# consider only differences between valid characters -> pairwise deletion
if dValChars.get(s1_base.upper(), None) == None:
continue
s2_base = oBT[pos].get(sample_2, ref_base)
# consider only differences between valid characters -> pairwise deletion
if dValChars.get(s2_base.upper(), None) == None:
continue
if s1_base != s2_base:
iDiffsInWin = 0
iWinStart = max(0, pos - ((iWinsize / 2) - 1))
iWinStop = min(int(flGenLen), (pos + (iWinsize / 2) + 1))
for x in range(iWinStart, iWinStop):
try:
winbase_1 = 'ref' if sample_1 == 'reference' else oBT[x].get(sample_1, 'ref')
winbase_2 = 'ref' if sample_2 == 'reference' else oBT[x].get(sample_2, 'ref')
except KeyError:
# x is a point in the alignment where everything is ref
continue
# now x is a point in the alignment where there is at least one SNP but not necessarity in s1 or s2
if winbase_1 != 'ref':
# if not ref check for valid char
if dValChars.get(winbase_1.upper(), None) == None:
continue
if winbase_2 != 'ref':
# if not ref check for valid char
if dValChars.get(winbase_2.upper(), None) == None:
continue
if winbase_1 != winbase_2:
iDiffsInWin += 1
d[pos] = iDiffsInWin
return (diffs, d)
# --------------------------------------------------------------------------------------------------
[docs]def parse_wg_alignment(dArgs, avail_pos, aSampleNames):
'''
Parse alignment to data structure
Parameters
----------
dArgs: dict
input parameter dictionary as created by get_args()
avail_pos: dict
dict of bintrees for each contig
aSampleNames: list
list of sample names
Returns
-------
0
also writes all data to avail_pos
'''
"""
avail_pos looks like this:
{'gi|194097589|ref|NC_011035.1|':
FastRBTree({2329: {'stats': <vcf2distancematrix.base_stats object at 0x40fb590>,
'reference': 'A',
'211700_H15498026501': 'C',
'211701_H15510030401': 'C',
'211702_H15522021601': 'C'},
3837: {'211700_H15498026501': 'G',
'stats': <vcf2distancematrix.base_stats object at 0x40fbf90>,
'211701_H15510030401': 'G',
'reference': 'T',
'211702_H15522021601': 'G'},
4140: {'211700_H15498026501': 'A',
'stats': <vcf2distancematrix.base_stats object at 0x40fb790>,
'211701_H15510030401': 'A',
'reference': 'G',
'211702_H15522021601': 'A'}})}
"""
if os.path.exists(dArgs['alignment_input']) == False:
logging.error("Input alignment file not found.")
return 1
dSeqs = {}
with open(dArgs['alignment_input'], 'r') as algn:
# parse the fa file
sSeq = ""
sName = ""
for sLine in algn:
sLine = sLine.strip()
if sLine.startswith(">"):
if len(sSeq) > 0:
dSeqs[sName] = sSeq.upper()
sSeq = ""
sName = sLine[1:].split(' ')[0]
continue
sSeq = sSeq + sLine
dSeqs[sName] = sSeq.upper()
sRefName = 'reference'
if dSeqs.has_key(sRefName) == False:
if dArgs['refgenomename'] is not None and dSeqs.has_key(dArgs['refgenomename']) == True:
sRefName = dArgs['refgenomename']
else:
logging.error("No seq named 'reference' in your alignment AND alternative name not found or not given either.")
return 1
else:
logging.info("Your reference appears to be named 'reference'")
for sn in dSeqs.keys():
aSampleNames.append(sn)
all_seq_len = [len(x) for x in dSeqs.values()]
try:
assert len(all_seq_len) == all_seq_len.count(all_seq_len[0])
except AssertionError:
logging.error("Not all seqs in your alignment have the same length")
return 1
avail_pos['alignment_contig'] = FastRBTree()
iSeqLen = all_seq_len[0]
for i in range(0, iSeqLen):
d = {}
d['reference'] = dSeqs[sRefName][i]
for sn in aSampleNames:
if sn != sRefName and dSeqs[sn][i] != d['reference']:
d[sn] = dSeqs[sn][i]
# are the nt at this pos all the same?
if len(d.values()) == d.values().count(d.values()[0]):
continue
oBS = BaseStats()
for sn in aSampleNames:
oBS.update(d, sn, '_')
d['stats'] = oBS
avail_pos['alignment_contig'][i+1] = d
return 0
# --------------------------------------------------------------------------------------------------
[docs]def parse_vcf_files(dArgs, avail_pos, aSampleNames):
'''
Parse vcf files to data structure
Parameters
----------
dArgs: dict
input parameter dictionary as created by get_args()
avail_pos: dict
dict of bintrees for each contig
aSampleNames: list
list of sample names
Returns
-------
0
also writes all data to avail_pos
'''
empty_tree = FastRBTree()
exclude = {}
include = {}
if dArgs["exclude"] != None or dArgs["include"] != None:
pos = {}
chr_pos = []
bed_file = dArgs["include"] if dArgs["include"] != None else dArgs["exclude"]
with open(bed_file) as fp:
for line in fp:
data = line.strip().split("\t")
chr_pos += [(i, False,) for i in xrange(int(data[1]), int(data[2]) + 1)]
try:
pos[data[0]] += chr_pos
except KeyError:
pos[data[0]] = chr_pos
pos = {chrom: FastRBTree(l) for chrom, l in pos.items()}
if dArgs["include"]:
include = pos
else:
exclude = pos
dSamNms = {'reference': None}
# First pass to get the references and the positions to be analysed.
for vcf_in in dArgs['input']:
reader = vcf.Reader(filename=vcf_in)
# Get the sample name from the VCF file (usually the read group).
sample_name = reader.samples[0]
assert dSamNms.has_key(sample_name) == False, "ERROR: %s is not a unique sample name in the set of vcfs" % (sample_name)
dSamNms[sample_name] = None
# Go over every position in the reader.
for record in reader:
# Inject a property about uncallable genotypes into record.
record.__setattr__("is_uncallable", is_uncallable(record)) # is_uncallable = types.MethodType(is_uncallable, record)
# SKIP indels, if not handled then can cause REF base to be >1
if record.is_indel and not (record.is_uncallable or record.is_monomorphic) or len(record.REF) > 1:
# if len(record.REF) > 1:
# print "%s\t%s\t%s\t%s\t%s" % (sample_name,record.POS,-1,record.FILTER,record)
continue
# if record.is_deletion and not record.is_uncallable:
# continue
# SKIP (or include) any pre-specified regions.
if include and record.POS not in include.get(record.CHROM, empty_tree) or exclude and record.POS in exclude.get(record.CHROM, empty_tree):
# if include.get(record.CHROM, empty_tree).get(record.POS, False) or \
# not exclude.get(record.CHROM, empty_tree).get(record.POS, True):
continue
try:
_ = avail_pos[record.CHROM]
except KeyError:
avail_pos[record.CHROM] = FastRBTree()
# Setup the position data to contain reference and stats.
if avail_pos[record.CHROM].get(record.POS, None) is None:
avail_pos[record.CHROM].insert(record.POS, {"reference": str(record.REF),
"stats": BaseStats()})
position_data = avail_pos[record.CHROM].get(record.POS)
assert len(position_data["reference"]) == 1, "Reference base must be singluar: in %s found %s @ %s" % (position_data["reference"], sample_name, record.POS)
# IF this is uncallable genotype, add gap "-"
if record.is_uncallable:
# TODO: Mentioned in issue: #7(gitlab)
position_data[sample_name] = "-"
# Update stats
position_data["stats"].gap += 1
elif not record.FILTER:
# If filter PASSED!
# Make sure the reference base is the same. Maybe a vcf from different species snuck in here?!
assert str(record.REF) == position_data["reference"] or str(record.REF) == 'N' or position_data["reference"] == 'N', "SOMETHING IS REALLY WRONG because reference for the same position is DIFFERENT! %s in %s (%s, %s)" % (record.POS, vcf_in, str(record.REF), position_data["reference"])
# update position_data['reference'] to a real base if possible
if position_data['reference'] == 'N' and str(record.REF) != 'N':
position_data['reference'] = str(record.REF).upper()
if record.is_snp:
if len(record.ALT) > 1:
logging.info("POS %s passed filters but has multiple alleles REF: %s, ALT: %s. Inserting N", record.POS, str(record.REF), str(record.ALT))
position_data[sample_name] = "N"
position_data["stats"].N += 1
else:
position_data[sample_name] = str(record.ALT[0]).upper()
position_data["stats"].mut += 1
# Filter(s) failed
elif record.is_snp:
position_data[sample_name] = 'N'
position_data["stats"].N += 1
else:
# filter fail; code as N for consistency
position_data[sample_name] = "N"
position_data["stats"].N += 1
# finished parsing
for sn in dSamNms.keys():
aSampleNames.append(sn)
return 0
# --------------------------------------------------------------------------------------------------
[docs]def get_dist_mat(aSampleNames, avail_pos, dArgs):
"""
Calculates the distance matrix, optionally removes recombination
from it and optionally normalises it
Parameters
----------
aSampleNames: list
list of sample names
avail_pos: dict
infomatin on all available positions
{'gi|194097589|ref|NC_011035.1|':
FastRBTree({2329: {'stats': <vcf2distancematrix.BaseStats object at 0x40fb590>,
'reference': 'A',
'211700_H15498026501': 'C',
'211701_H15510030401': 'C',
'211702_H15522021601': 'C'},
3837: {'211700_H15498026501': 'G',
'stats': <vcf2distancematrix.BaseStats object at 0x40fbf90>,
'211701_H15510030401': 'G',
'reference': 'T',
'211702_H15522021601': 'G'},
4140: {'211700_H15498026501': 'A',
'stats': <vcf2distancematrix.BaseStats object at 0x40fb790>,
'211701_H15510030401': 'A',
'reference': 'G',
'211702_H15522021601': 'A'}})}
dArgs: dict
input parameter dictionary as created by get_args()
Returns
-------
call to get_sample_pair_densities with parameters unpacked
"""
dDen = None
dRemovals = None
if dArgs['remove_recombination'] == True:
if HAVE_SCIPY == False:
logging.error("Cannot import scipy requied for recombination removal.")
return None
dDen = precompute_snp_densities(avail_pos, aSampleNames, dArgs)
dRemovals = {}
for i, sample_1 in enumerate(aSampleNames):
dRemovals[sample_1] = {}
for j, sample_2 in enumerate(aSampleNames):
if j <= i:
dRemovals[sample_1][sample_2] = 0
# initialise empty matrix
dist_mat = {}
for i, sample_1 in enumerate(aSampleNames):
dist_mat[sample_1] = {}
for j, sample_2 in enumerate(aSampleNames):
if j <= i:
if dArgs['substitution'] == 'k80' or dArgs['substitution'] == 't93':
dist_mat[sample_1][sample_2] = [0.0, 0.0]
elif dArgs['substitution'] == 'tn84':
dist_mat[sample_1][sample_2] = {'A': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0},
'C': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0},
'G': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0},
'T': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0}}
else:
dist_mat[sample_1][sample_2] = 0.0
else: # j > i
pass
flGenLen = 0.0
if dDen != None:
(_, flGenLen) = get_ref_freqs(dArgs['refgenome'], len_only=True)
aStats = []
for sContig in avail_pos.keys():
for pos in avail_pos[sContig]:
ref_base = avail_pos[sContig][pos].get("reference")
for i, sample_1 in enumerate(aSampleNames):
s1_base = avail_pos[sContig][pos].get(sample_1, ref_base)
# consider only differences between valid characters -> pairwise deletion
if dValChars.get(s1_base.upper(), None) == None:
continue
for j, sample_2 in enumerate(aSampleNames):
if j < i:
s2_base = avail_pos[sContig][pos].get(sample_2, ref_base)
# consider only differences between valid characters -> pairwise deletion
if dValChars.get(s2_base.upper(), None) == None:
continue
# Recombination removal happens here
if dDen != None and s1_base != s2_base:
iDiffsInWin = dDen[sContig][sample_1][sample_2][pos]
iTotalDiffs = dDen['diffs'][sample_1][sample_2]
flNofWins = flGenLen / max([int(round(flGenLen/iTotalDiffs)), 100])
p_hitting_window = 1.0 / flNofWins
p_ok = 1.0
# only do binomial test if there are 'too many differences'
# => do not exclude diffs because there are 'not enough'
if iDiffsInWin > 1 and iDiffsInWin > iTotalDiffs / float(flNofWins):
# binomial test:
# what is the probability that I have x successes (i.e. diffs in the current window)
# given n trials (i.e. the total number of differences between the two samples)
# and given that the probabilty of success is 1/total_num_windows
p_ok = binom_test(iDiffsInWin, iTotalDiffs, p_hitting_window)
corr_p_thresh = (0.05 / iTotalDiffs)
aStats.append("%s\t%s\t%i\t%i\t%i\t%e\t%e\n" % (sample_1,
sample_2,
pos,
iDiffsInWin,
iTotalDiffs,
p_ok,
corr_p_thresh))
# null-hypothesis: probability of hitting it is equal for all windows, i.e.
# the diffs between the samples are uniformly distributed
# Bonferroni corrected p-value threshold: (0.05 / iTotalDiffs)
if p_ok <= corr_p_thresh:
# likely to be recombinat site (for a given definition of 'likely' and 'recombinant')
# aStats.append("%s\t%s\t%i\t%i\t%i\t%e\n" % (sample_1, sample_2, pos, iDiffsInWin, iTotalDiffs, p_ok))
dRemovals[sample_1][sample_2] += 1
continue
if dArgs['substitution'] == 'k80' or dArgs['substitution'] == 't93':
k = get_difference_value(s1_base.upper(), s2_base.upper(), dArgs['substitution'])
dist_mat[sample_1][sample_2][0] += k[0]
dist_mat[sample_1][sample_2][1] += k[1]
elif dArgs['substitution'] == 'tn84':
if s1_base > s2_base:
# don't do this:
# s1_base, s2_base = s2_base, s1_base
# dist_mat[sample_1][sample_2][s1_base][s2_base] += 1.0
# => messes up the references
dist_mat[sample_1][sample_2][s2_base][s1_base] += 1.0
else:
dist_mat[sample_1][sample_2][s1_base][s2_base] += 1.0
elif dArgs['substitution'] == 'number_of_differences' or dArgs['substitution'] == 'jc69':
k = get_difference_value(s1_base.upper(), s2_base.upper(), dArgs['substitution'])
dist_mat[sample_1][sample_2] += k
else:
raise NotImplementedError
else: # j >= i
pass
# write additional stats if required
if dArgs['remove_recombination'] == True and dArgs['with_stats'] == True:
sOutBase = os.path.splitext(dArgs['out'])[0]
with open("%s.removals.tsv" % (sOutBase), 'w') as fOut:
for i, sample_1 in enumerate(aSampleNames):
row = sample_1
for j, sample_2 in enumerate(aSampleNames):
if j < i:
row += "%s%i" % ('\t', dRemovals[sample_1][sample_2])
fOut.write("%s\n" % row)
with open("%s.proportion_removed.tsv" % (sOutBase), 'w') as fOut:
for i, sample_1 in enumerate(aSampleNames):
row = sample_1
for j, sample_2 in enumerate(aSampleNames):
if j < i:
try:
row += "%s%f" % ('\t', dRemovals[sample_1][sample_2] \
/ (dist_mat[sample_1][sample_2] \
+ dRemovals[sample_1][sample_2]))
except ZeroDivisionError:
row += "\tNAN"
fOut.write("%s\n" % row)
with open("%s.all_snps.tsv" % (sOutBase), 'w') as fOut:
fOut.write("sample_1\tsample_2\tposition\tdiffs_in_win\ttotal_diffs\tp_ok\tthreshold\n")
for sLine in aStats:
fOut.write(sLine)
# 'normalise' distance matrix according to model requested
if dArgs['substitution'] == 'jc69':
dist_mat = normalise_jc69(dist_mat, dArgs['refgenome'], aSampleNames)
elif dArgs['substitution'] == 'k80':
dist_mat = normalise_k80(dist_mat, dArgs['refgenome'], aSampleNames)
elif dArgs['substitution'] == 'tn84':
dist_mat = normalise_tn84(dist_mat, dArgs['refgenome'], aSampleNames)
elif dArgs['substitution'] == 't93':
dist_mat = normalise_t93(dist_mat, dArgs['refgenome'], aSampleNames)
elif dArgs['substitution'] == 'number_of_differences':
pass
else:
raise NotImplementedError
return dist_mat
# --------------------------------------------------------------------------------------------------
[docs]def normalise_t93(d, ref, names):
'''
Normalise distance matrix according to the Tamura 3-parameter distance model
see: Nei and Zhang: Evolutionary Distance: Estimation,
ENCYCLOPEDIA OF LIFE SCIENCES 2005,
doi: 10.1038/npg.els.0005108
http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 16
Parameters
----------
d: dict
distance matrix
ref: str
reference genome file name
names: list
list of sample names
Returns
-------
d: dict
normalised matrix
'''
(dRefFreq, flGenLen) = get_ref_freqs(ref)
flGC = dRefFreq['C'] + dRefFreq['G']
h = 2.0 * flGC * (1.0 - flGC)
for i, sample_1 in enumerate(names):
for j, sample_2 in enumerate(names):
if j < i:
Q = d[sample_1][sample_2][1] / flGenLen
p = sum(d[sample_1][sample_2]) / flGenLen
x1 = -h * math.log(1.0 - p / h - Q)
x2 = 0.5 * (1.0 - h) * math.log(1.0 - (2.0 * Q))
d[sample_1][sample_2] = x1 - x2
elif i == j:
d[sample_1][sample_2] = 0.0
return d
# --------------------------------------------------------------------------------------------------
[docs]def normalise_tn84(d, ref, names):
"""
Normalise distance matrix according to the Kimura 2-parameter distance model
see: Nei and Zhang: Evolutionary Distance: Estimation,
ENCYCLOPEDIA OF LIFE SCIENCES 2005,
doi: 10.1038/npg.els.0005108
http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 13 and 14
Parameters
----------
d: dict
distance matrix
d = {'211701_H15510030401': {'211700_H15498026501': {'A': {'A': 1152.0,
'C': 114.0,
'G': 545.0,
'T': 35.0},
'C': {'A': 0.0,
'C': 1233.0,
'G': 108.0,
'T': 467.0},
'G': {'A': 0.0,
'C': 0.0,
'G': 1283.0,
'T': 100.0},
'T': {'A': 0.0,
'C': 0.0,
'G': 0.0,
'T': 1177.0}}, ...}, ...}
ref: str
reference genome file name
names: list
list of sample names
Returns
-------
d: dict
normalised matrix
"""
(dRefFreq, flGenLen) = get_ref_freqs(ref)
sum1 = sum([x * x for x in dRefFreq.values()])
for i, sample_1 in enumerate(names):
for j, sample_2 in enumerate(names):
if j < i:
iNofDiff = getTotalNofDiff_tn84(d[sample_1][sample_2])
p = iNofDiff / flGenLen
sum2 = 0.0
for m, nuc1 in enumerate(['A', 'C', 'G', 'T']):
for n, nuc2 in enumerate(['A', 'C', 'G', 'T']):
if m < n:
flFreqNucPair = d[sample_1][sample_2][nuc1][nuc2] / flGenLen
sum2 += ((flFreqNucPair ** 2.0) / (2.0 * dRefFreq[nuc1] * dRefFreq[nuc2]))
if sum2 == 0.0: # samples are "identical"
d[sample_1][sample_2] = 0.0
else:
b = 0.5 * ((1.0 - sum1) + (p * p / sum2))
d[sample_1][sample_2] = -b * math.log(1.0 - (p / b))
elif i == j:
d[sample_1][sample_2] = 0.0
else: # j > i
pass
return d
# --------------------------------------------------------------------------------------------------
[docs]def normalise_k80(d, ref, names):
'''
Normalise distance matrix according to the Tajima-Nei distance model
see: Nei and Zhang: Evolutionary Distance: Estimation,
ENCYCLOPEDIA OF LIFE SCIENCES 2005,
doi: 10.1038/npg.els.0005108
http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 9
Parameters
----------
d: dict
distance matrix
ref: str
reference genome file name
names: list
list of sample names
Returns
-------
d: dict
normalised matrix
'''
(_, flGenLen) = get_ref_freqs(ref, len_only=True)
for i, sample_1 in enumerate(names):
for j, sample_2 in enumerate(names):
if j < i:
P = d[sample_1][sample_2][0] / flGenLen
Q = d[sample_1][sample_2][1] / flGenLen
w1 = 1.0 - (2.0 * P) - Q
w2 = 1.0 - (2.0 * Q)
d[sample_1][sample_2] = (-0.5 * math.log(w1)) - (0.25 * math.log(w2))
elif i == j:
d[sample_1][sample_2] = 0.0
return d
# --------------------------------------------------------------------------------------------------
[docs]def normalise_jc69(d, ref, names):
'''
Normalise distance matrix according to the Jukes-Cantor distance model
see: Nei and Zhang: Evolutionary Distance: Estimation,
ENCYCLOPEDIA OF LIFE SCIENCES 2005,
doi: 10.1038/npg.els.0005108
http://www.umich.edu/~zhanglab/publications/2003/a0005108.pdf, equation 7
Parameters
----------
d: dict
distance matrix
ref: str
reference genome file name
names: list
list of sample names
Returns
-------
d: dict
normalised matrix
'''
flGenLen = 0.0
with open(ref, 'r') as fRef:
for sLine in fRef:
if sLine.startswith(">"):
continue
else:
flGenLen += len(sLine.strip())
logging.info("genome length: %s" % flGenLen)
for i, sample_1 in enumerate(names):
for j, sample_2 in enumerate(names):
if j < i:
p = d[sample_1][sample_2] / flGenLen
d[sample_1][sample_2] = (-3.0 / 4.0) * math.log(1.0 - ((4.0 / 3.0) * p))
return d
# --------------------------------------------------------------------------------------------------
[docs]def get_difference_value(s1_base, s2_base, sSubs):
'''
Get difference value for a given set of bases.
Parameters
----------
s1_base: str
a charcater
s2_base: str
a charcater
sSubs: str
distance model
Returns
-------
difference: float or list
depending on the distance model either a float 1.0 or 0.0
of a list of two floats
'''
if sSubs == 'number_of_differences' or sSubs == 'jc69':
try:
_ = dValChars[s1_base]
except KeyError:
return 0.0
try:
_ = dValChars[s2_base]
except KeyError:
return 0.0
if s1_base == s2_base:
return 0.0
else:
return 1.0
elif sSubs == 'k80' or sSubs == 't93':
try:
_ = dValChars[s1_base]
except KeyError:
return [0.0, 0.0]
try:
_ = dValChars[s2_base]
except KeyError:
return [0.0, 0.0]
return dK80[s1_base][s2_base]
else:
raise NotImplementedError
# --------------------------------------------------------------------------------------------------
[docs]def getTotalNofDiff_tn84(d):
"""
Sum up total number of differences for a dict like the one in the input
Parameters
----------
d: dict
{'A': {'A': 1152.0, 'C': 114.0, 'G': 545.0, 'T': 35.0},
'C': {'A': 0.0, 'C': 1233.0, 'G': 108.0, 'T': 467.0},
'G': {'A': 0.0, 'C': 0.0, 'G': 1283.0, 'T': 100.0},
'T': {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 1177.0}}
Returns
-------
t: float
the sum of all differences(1369.0 in above example case)
"""
t = 0.0
for k, v in d.items():
t += sum([y if x != k else 0.0 for x, y in v.items()])
return t
# --------------------------------------------------------------------------------------------------
[docs]def get_ref_freqs(ref, len_only=False):
"""
Get the length of the reference genome and optionally the nucleotide frequencies in it
Parameters
----------
ref: str
reference genome filename
len_only: boolean
get genome lengths only [default FALSE, also get nucleotide frequencies]
Returns
-------
(dRefFreq, flGenLen): tuple
dRefFreq: dict
dRefFreq = {'A': 0.25, 'C': 0.24, 'G': 0.26, 'T': 0.25}
flGenLen: float
genome length
"""
# get frequency of a, c, g, and t in ref
dRefFreq = {'A': 0.0, 'C': 0.0, 'G': 0.0, 'T': 0.0}
flGenLen = 0.0
with open(ref, 'r') as fRef:
for sLine in fRef:
if sLine.startswith(">"):
continue
else:
sLine = sLine.strip()
flGenLen += len(sLine)
if len_only == True:
continue
sLine = sLine.upper()
for n in sLine:
try:
dRefFreq[n] += 1.0
except KeyError:
pass
if len_only == False:
for (i, j) in dRefFreq.items():
dRefFreq[i] = j / flGenLen
logging.info("genome length: %s" % flGenLen)
return (dRefFreq, flGenLen)
# --------------------------------------------------------------------------------------------------
[docs]def calculate_memory_for_sort():
"""Calculate available memory for ``samtools sort`` function.
If there is enough memory, no temp files are created. **Enough**
is defined as at least 1G per CPU.
Returns
-------
sort_memory: str or None
String to use directly with *-m* option in sort, or None.
"""
avail_memory = None
if HAVE_PSUTIL == True:
mem = virtual_memory()
avail_memory = mem.total
else:
try:
avail_memory = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') # e.g. 4015976448
except ValueError:
logging.error("If you're in Mac OS you need to have the psutil Python library.")
raise SystemExit
avail_cpu = multiprocessing.cpu_count()
sort_memory = avail_memory / avail_cpu / 1024 ** 2
# samtools default from documentation is 768M, to be conservative
# only use -m option when there is more than 1G per CPU.
if sort_memory < 1024:
sort_memory = None
else:
sort_memory = "%sG" % (int(floor(sort_memory / 1024)))
return sort_memory
# --------------------------------------------------------------------------------------------------
if __name__ == "__main__":
print calculate_memory_for_sort()