#!env python2

import random
from string import maketrans

# probabilities of reading a certain base instead of the correct one if we decided to read a wrong base
SNPprob = {'A': {'A':0.0,   'C':1.0/3, 'G':1.0/3, 'T':1.0/3},
           'C': {'A':1.0/3, 'C':0.0,   'G':1.0/3, 'T':1.0/3},
           'G': {'A':1.0/3, 'C':1.0/3, 'G':0.0,   'T':1.0/3},
           'T': {'A':1.0/3, 'C':1.0/3, 'G':1.0/3, 'T':0.0}}

# reverse complement
complement_table = maketrans('ACGT', 'TGCA')
def reverseComplement(s):
  return s.translate(complement_table)[::-1]

# return the reverse complement if the gods of the Mersenne twister favor you
# EDIT: the project is hard enough without having to worry about reverse complements...
def maybe_reverse(s):
  return s

#def maybe_reverse(s):
#  if random.getrandbits(1):
#    return s
#  else:
#    return reverseComplement(s)


# return a random base
def rndBase():
	return 'ATCG'[random.randint(0,3)]

# return a string made up of random bases
def rndString(n):
	out=''
	for x in xrange (0, n):
		out += rndBase()
	return out

# return a read of the reference genome "ref" at position "pos" with errors according to pIns, pDel, pSNP
def getReadWithStartingPoint(ref, pos, readLength, pDel, pIns, pSNP):
  out = ''
  while len(out) < readLength:
    p = random.random() - pIns
    if p < 0 or pos == len(ref):
      out += rndBase()
      continue
    p -= pDel
    if p < 0:
      pos += 1
      continue
    p -= pSNP
    if p < 0:
      q = random.random()
      for replace in ('A', 'C', 'G', 'T'):
        q -= SNPprob[ref[pos]][replace]
        if q < 0:
          out += replace
          break
      else:
        out += rndBase()
      pos += 1
      continue
    out += ref[pos]
    pos += 1
  return out

def getRead(ref, readLength, pDel, pIns, pSNP):
	pos = random.randint(0, len(ref) - readLength)
	return getReadWithStartingPoint(ref, pos, readLength, pDel, pIns, pSNP)	
	

def getPairedEnd(ref, readLength, gap, pDel, pIns, pSNP):
	gap = random.randint(gap, 2*gap)
	pos = random.randint(0, len(ref) - 2*readLength - gap)
	return getReadWithStartingPoint(ref, pos, readLength, pDel, pIns, pSNP) +\
        ' ' +\
        getReadWithStartingPoint(ref, pos + readLength + gap, readLength, pDel, pIns, pSNP)

def main(params):
  random.seed(params['seed'])
  
  ref = rndString(params['n'])
  print "complete genome:\n%s\n" % ref
  numShortReadPairs = int(params['n'] * params['shortReadCoverage'] / params['shortReadLength'] / 2)
  numLongReads = int(params['n'] * params['longReadCoverage'] / params['longReadLength'])
  
  print "generating %d short read-pairs (length %d), %d long reads (length %d)" % (numShortReadPairs, params['shortReadLength'], numLongReads, params['longReadLength'])
  print "%d bases, each contained in (on avg) %d short read pairs and %d long reads\n" % (params['n'], params['shortReadCoverage'], params['longReadCoverage'])

  # print mean insert size
  print (3 * params['pairedEndGap']) / 2
  for x in xrange (0, numShortReadPairs):
    print maybe_reverse(getPairedEnd(ref, params['shortReadLength'], params['pairedEndGap'], params['pDeletionShort'], params['pInsertionShort'], params['pSNPShort']))
  for x in xrange (0, numLongReads):
    print maybe_reverse(getRead(ref, params['longReadLength'], params['pDeletionLong'], params['pInsertionLong'], params['pSNPLong']))

if __name__ == '__main__':
	params={
			'seed':None,             #seed for the random number generator; fixed seeds will be used for evaluation
			'n':200,                 #number of base-pairs in the sequence. Ranges from 1 to 10^6
			'shortReadCoverage':5,   #average number of short reads covering a single base-pair: 10 in scenario 'short', 5 in scenario 'mixt', 0 in scenario 'long'
			'longReadCoverage':5,    #average number of long reads covering a single base-pair: 0 in scenario 'short', 5 in scenario 'mixt', 10 in scenario 'long'
			'shortReadLength':15,    #length of a short read, ranges from 15 to 100
			'longReadLength':100,    #length of a long read, ranges from 150 to 1000
			'pairedEndGap':15,       #minimum gap between consecutive short reads (max = *2), ranges from 15 to 150
			'pDeletionShort':0.02,   #probability of deletion in short read (<0.05)
			'pInsertionShort':0.02,  #probability of insertion in short read (<0.05)
			'pSNPShort':0.02,        #probability of SNP in short read (<0.05)
			'pDeletionLong':0.06,    #probability of deletion in long reads (>0.05, <0.1)
			'pInsertionLong':0.06,   #probability of insertion in long reads (>0.05, <0.1)
			'pSNPLong':0.06          #probability of SNP in long reads (>0.05, <0.1)
		}
	main(params)
