@@ -3,4 +3,4 @@ VIDJIL_JSON_VERSION_REQUIRED = "2014.10"
VIDJIL_JSON_VERSION = "2016b"
# Directory where Fuse preprocesses are stored.
# PRE_PROCESS_DIR = "/opt/fuse-pre-process"
PRE_PROCESS_DIR = "."
#!/usr/bin/env python
# -*- coding: utf-8 -*-
''' Program to perform spike-in normalization in .vidjil files.
Developed at the Boldrini Center, Brazil, in 2018-2019.
'''
############################################################
### imports
from __future__ import print_function
import sys
import json
import os
import math
############################################################
### constants
version = 'S0.01'
UNIVERSAL = 'universal'
############################################################
### routines
def linearRegression(y, x):
    '''
    function for curve fitting y = ax
    just slope, no intercept, to force the line through (0,0)
    '''
    lr = {}
    n = len(y)
    assert(n == len(x))
    lr['n'] = n
    sum_x = 0.0
    sum_y = 0.0
    sum_xy = 0.0
    sum_xx = 0.0
    sum_yy = 0.0
    for i in range(n):
        sum_x += x[i]
        sum_y += y[i]
        sum_xy += (x[i]*y[i])
        sum_xx += (x[i]*x[i])
        sum_yy += (y[i]*y[i])
    ## zero-intercept least squares: slope = sum(x*y) / sum(x*x)
    lr['slope'] = sum_xy / sum_xx
    denom = (n*sum_xx - sum_x*sum_x)*(n*sum_yy - sum_y*sum_y)
    lr['r2'] = math.pow((n*sum_xy - sum_x*sum_y), 2)/denom if denom > 0 else 'N/A'
    lr['s'] = math.sqrt((sum_yy - sum_xy*sum_xy/sum_xx)/n)
    return lr
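
## Quick illustration of the fit above (illustrative values only):
## points lying exactly on y = 2x recover the slope with a perfect fit,
##   linearRegression([2.0, 4.0, 6.0], [1.0, 2.0, 3.0])
##   returns {'n': 3, 'slope': 2.0, 'r2': 1.0, 's': 0.0}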

def computeCopies(data):
    reads = {}
    copies = {}
    ## get config
    for spike in data['config']['labels']:
        copies[spike['name']] = spike['copies']
    ## get reads
    for clone in data['clones']:
        if 'label' in clone:
            ## spike-in clone
            label = clone['label']
            ## show spike name
            clone['name'] = label + ' ' + clone['name']
            ## grab read data
            a = clone['reads']
            if len(a) > 1:
                print(' *** reads array with many elements', file=sys.stderr)
            ## add reads
            if label not in reads:
                reads[label] = 0.0
            reads[label] += a[0]
    return copies, reads, data['reads']['segmented'][0]
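
## Sketch of the input layout that computeCopies assumes (hypothetical
## names and values; only the fields read above are shown):
##   {"config": {"labels": [{"name": "SP1", "copies": "10"}]},
##    "reads":  {"segmented": [200000]},
##    "clones": [{"label": "SP1", "name": "clone-1", "reads": [1000]}]}
## 'copies' may be numeric or a string, since the fit applies int() to it.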
############################################################
def computeUniversalCoefficient(copies, reads):
    '''
    curve fitting:
    skip computing f for each individual family,
    compute just the universal coefficient
    '''
    f = {}
    r2 = {}
    perr = {}
    ### universal coefficient
    ### uses one point per copy number
    ### readsList is the list of read counts for each copy number
    readsList = {}
    for label in reads:
        nCopies = copies[label]
        if nCopies not in readsList:
            readsList[nCopies] = []
        readsList[nCopies].append(reads[label])
    ### x = mean of reads for this copy number
    ### y = copy number
    x = []
    y = []
    for nCopies in readsList:
        nReadsList = readsList[nCopies]
        s = sum(nReadsList)
        c = len(nReadsList)
        x.append(1.0*s/c)
        y.append(1.0*int(nCopies))
        ## print('{0:5} {1:10.1f}'.format(s/c, int(nCopies)))
    ### test for not enough items
    if len(x) == 0 or max(x) <= 10.0:
        print('** Not enough spike-ins **', file=sys.stderr)
        print('** No normalization performed **', file=sys.stderr)
    else:
        ### fit curve
        lr = linearRegression(y, x)
        f[UNIVERSAL] = lr['slope']
        r2[UNIVERSAL] = lr['r2']
        perr[UNIVERSAL] = lr['s']
        if msgs >= 1:
            fmtStr = 'Universal coefficient estimation: {0:15.13f} s: {1:5.1f} r2: {2:15.13f}'
            print(fmtStr.format(f[UNIVERSAL], perr[UNIVERSAL], r2[UNIVERSAL]), file=sys.stderr)
    return f
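
## Worked example with made-up numbers: if the spike-ins with copy number 10
## average 1000 reads (and they are the only point), the zero-intercept fit
## yields f[UNIVERSAL] = (1000*10) / (1000*1000) = 0.01 copies per read.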
############################################################
### add normalized reads
def addNormalizedReads(data, coeff, totalReads):
    if msgs >= 1:
        print('Normalizing reads and printing output file', file=sys.stderr)
    ### all clones will be normalized
    for clone in data['clones']:
        ## grab read data
        a = clone['reads']
        if len(a) > 1:
            print(' *** reads array with many elements', file=sys.stderr)
        ## update counters for clones
        reads = a[0]
        clone['normalized_reads'] = [ reads*coeff*totalReads/100000 ]
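
## Worked example with made-up numbers: with coeff = 0.01 and
## totalReads = 200000, a clone with 500 reads gets
##   normalized_reads = [500 * 0.01 * 200000/100000] = [10.0]
## i.e. reads*coeff estimates copies, and the totalReads/100000 factor
## scales that estimate by sequencing depth relative to 100,000 reads.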
############################################################
### command line, initial msg
if __name__ == '__main__':
    inf = sys.argv[1]
    outf = sys.argv[2]
    msgs = 1 if len(sys.argv) >= 4 else 0
    # read input file
    if msgs >= 1:
        print('Reading follow-up file', file=sys.stderr)
    with open(inf) as inp:
        data = json.load(inp)
    # process data
    copies, reads, totalReads = computeCopies(data)
    f = computeUniversalCoefficient(copies, reads)
    if UNIVERSAL in f:
        addNormalizedReads(data, f[UNIVERSAL], totalReads)
    # write output file
    with open(outf, 'w') as of:
        print(json.dumps(data, sort_keys=True, indent=2), file=of)
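
## usage: python spike-normalization.py <input.vidjil> <output.vidjil> [verbose]
## passing any third argument turns on the progress messages on stderr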
!LAUNCH: python ../../spike-normalization.py ../data/input-spike-norm.vidjil out.vidjil && cat out.vidjil
$ 13 clones in the output, as in the input
13: "id"
$ "normalized_reads" fields for each clone
13: "normalized_reads"
$ Good value for the first clone
: 1.3892807