import glob,os
import numpy as np
from scipy.stats import linregress

# container class for mean squared displacement and mean squared rotation
class MSD:
    """Container for one trajectory segment of a single cell.

    Attributes:
        simid:  identifier of the simulation, stored as given.
        cellid: identifier of the cell, stored as given.
        time:   the input time values shifted so the first entry is 0.
        trans:  (sqrt(trans) - sqrt(trans[0]))**2, element-wise.
        rot:    (sqrt(rot) - sqrt(rot[0]))**2, element-wise.
    """

    def __init__(self, simid, cellid, time, trans, rot):
        def rezero(series):
            # Shift the square root of the series so that it starts at zero,
            # then square again.
            return np.power(np.sqrt(series) - np.sqrt(series[0]), 2)

        self.simid = simid
        self.cellid = cellid
        self.time = time - time[0]
        self.trans = rezero(trans)
        self.rot = rezero(rot)

# save MSD/MSR for a bin
def saveAllMSDPerBin(cells, key, basename, dt, outdir):
    """Write the mean and standard deviation of the merged MSD/MSR of one bin.

    Parameters:
        cells:    list of MSD instances belonging to the bin (passed to mergeMSD).
        key:      bin label; its str() becomes part of the file name.
        basename: file name prefix.
        dt:       multiplier turning the row index into the first output column.
        outdir:   output directory prefix; it is concatenated as-is, so it
                  must already end with a path separator.

    The file outdir+basename+'_msd_csz='+str(key)+'.data' gets one header line
    followed by one row per time point with the columns:
    time, mean msdT, std msdT, mean msdA, std msdA, number of samples.
    """
    (msdA, msdT) = mergeMSD(cells)
    tlist = np.arange(0, len(msdA)) * dt
    # statistics are taken over the merged segments (columns)
    mA = np.mean(msdA, axis=1)
    sA = np.std(msdA, axis=1)
    mT = np.mean(msdT, axis=1)
    sT = np.std(msdT, axis=1)
    # the same sample count is repeated on every row
    n = msdA.shape[1] * np.ones_like(sT)
    # the context manager guarantees the file is flushed and closed
    with open(outdir + basename + '_msd_csz=' + str(key) + '.data', 'w') as f:
        f.write('#MCS\tmsdT\t+/-\tmsdA\t+/-\tnsamples\n')
        np.savetxt(f, np.column_stack((tlist, mT, sT, mA, sA, n)))

# create array of MSD and MSR from a list of cells that are in a cluster bin
# We first look for the shortest trajectory in cells, the length of this trajectory is l.
# Then, we cut all MSD and MSR lists in sublists of length l and add these to the MSD or MSR matrix.
def mergeMSD(cells):
    """Cut all trajectories into equal-length segments and stack them as columns.

    The segment length l is the length of the shortest `rot` array in `cells`.
    Every trajectory is cut into consecutive, non-overlapping pieces of length
    l (a trailing remainder shorter than l is discarded).  Each piece is
    re-zeroed as (sqrt(x) - sqrt(x[0]))**2 before it is stored.

    Parameters:
        cells: non-empty sequence of objects with array attributes `rot` and
               `trans`.

    Returns:
        (msdA, msdT): two arrays of shape (l, number_of_segments) built from
        the `rot` and `trans` data respectively.

    Raises:
        ValueError: if `cells` is empty or the shortest trajectory has length 0
                    (a zero segment length would never advance through the data).
    """
    if len(cells) == 0:
        raise ValueError('mergeMSD needs at least one trajectory')
    l = min(len(msd.rot) for msd in cells)
    if l == 0:
        raise ValueError('mergeMSD cannot merge zero-length trajectories')

    # Collect the columns first and stack once at the end; stacking inside the
    # loop would copy the whole matrix again for every new segment.
    rot_cols = []
    trans_cols = []
    for msd in cells:
        # segment start positions i with i + l <= len(msd.rot)
        for i in range(0, len(msd.rot) - l + 1, l):
            rot_cols.append(
                np.power(np.sqrt(msd.rot[i:i+l]) - np.sqrt(msd.rot[i]), 2))
            trans_cols.append(
                np.power(np.sqrt(msd.trans[i:i+l]) - np.sqrt(msd.trans[i]), 2))
    msdA = np.column_stack([np.empty((l, 0))] + rot_cols)
    msdT = np.column_stack([np.empty((l, 0))] + trans_cols)
    return (msdA, msdT)

# calculate the time intervals in which values in val are in the same bin
def calcTimeInBins(val, bins):
    """Split `val` into runs of consecutive entries that fall in the same bin.

    NOTE: `val` is modified in place.  Every entry with low < value <= up for
    a (low, up) pair in `bins` is overwritten with 0.5*(low+1+up); entries
    outside all bins keep their value.

    Parameters:
        val:  1-D numpy array of values (overwritten, see above).
        bins: iterable of (low, up) pairs.

    Returns:
        2-D array with one row per run and three columns:
        value of the run, number of entries in the run, and the index of the
        entry just before the run (so the run starts at that index + 1; the
        first run therefore reports -1).
    """
    # replace every value by the label of the bin that contains it
    for lower, upper in bins:
        inside = (val > lower) & (val <= upper)
        val[inside] = 0.5 * (lower + 1 + upper)

    # a non-zero difference between neighbours marks the end of a run
    steps = val[1:] - val[:-1]
    change_at = np.where(steps != 0)[0]

    # run boundaries, with sentinels before the first and at the last entry
    edges = np.concatenate(([-1], change_at, [len(steps)]))
    before_run = edges[:-1]

    labels = val[before_run + 1]
    durations = edges[1:] - before_run
    return np.column_stack((labels, durations, before_run))

# read clusters and MSD for each cell (generated by driver.py)
def readDataPerSim(bins, nid, indir, minsteps):
    """Collect MSD trajectory segments per bin for one simulation.

    Every file matching indir+nid+'/'+nid+'_cell_*_cluster+MSD.data' is read.
    Columns 0, 1, 2 and 4 are used as time, translational MSD, rotational MSD
    and the quantity that is binned (column 3 is read but not used).

    Parameters:
        bins:     iterable of (low, up) pairs, passed to calcTimeInBins.
        nid:      simulation identifier; used as directory and file prefix.
        indir:    input directory prefix; concatenated as-is.
        minsteps: minimum number of rows an interval must span to be kept.

    Returns:
        dict mapping a bin label (as produced by calcTimeInBins) to a list of
        MSD instances, one per retained interval.
    """
    msddict = {}
    # find all files with clusters and MSD
    for f in glob.glob(indir + nid + '/' + nid + '_' + 'cell_*_cluster+MSD.data'):
        # basename instead of split('/') so the cell id is also found when the
        # platform uses another path separator
        fname = os.path.basename(f)
        cid = int(fname.replace(nid + '_cell_', '').replace('_cluster+MSD.data', ''))
        t, msdtrans, msdrot, _ci, cs = np.loadtxt(f, usecols=(0, 1, 2, 3, 4), unpack=True)
        cs[np.isnan(cs)] = -1
        # calcTimeInBins also rewrites cs in place with the bin labels; those
        # labels are used as dictionary keys below
        tbins = calcTimeInBins(cs, bins)
        # keep only intervals whose label is positive
        tbins = tbins[tbins[:, 0] > 0]
        if tbins.shape[0] == 0:
            continue
        # keep only intervals spanning at least minsteps rows
        tbins = tbins[tbins[:, 1] >= minsteps]
        # tbins is a float array; array indices must be integers, so convert
        # the start positions and lengths before slicing.
        # NOTE(review): the 0:-1 slice skips the last remaining interval of
        # each cell.  Kept as in the original - confirm that this is intended.
        starts = tbins[0:-1, 2].astype(int) + 1
        stops = starts + tbins[0:-1, 1].astype(int)
        for start, stop in zip(starts, stops):
            msddict.setdefault(cs[start], []).append(
                MSD(nid, cid, t[start:stop], msdtrans[start:stop], msdrot[start:stop]))
    return msddict

def calcD(file, minN=None):
    """Fit straight lines to the translational and rotational MSD in `file`.

    Parameters:
        file: anything np.loadtxt accepts; columns 0, 1, 3 and 5 are used as
              time, translational MSD, rotational MSD and sample count.
        minN: optional threshold; when given and the sample count on the first
              row is below it, no fit is made.

    Returns:
        [slopeT, stderrT, pvalueT, slopeR, stderrR, pvalueR] from
        scipy.stats.linregress, or an array of six NaNs when the minN check
        fails.
    """
    (t, msdT, stdT, msdR, stdR, n) = np.loadtxt(
        file, unpack=True, usecols=(0, 1, 2, 3, 4, 5))

    # too few trajectories: report NaN instead of a fit
    if minN is not None and n[0] < minN:
        return np.nan * np.ones((6))

    # linregress returns (slope, intercept, rvalue, pvalue, stderr)
    fitT = linregress(t, msdT)
    fitR = linregress(t, msdR)
    return [fitT[0], fitT[4], fitT[3], fitR[0], fitR[4], fitR[3]]


#--- settings ---#
# path where data files are located, e.g. longcells_nochem_CF_0.data
datapath = 'data/'
# name of the simulation
simname = 'longcells_nochem_003'
# path to results
outpath = 'results/'
# list of repeat numbers
repeats = range(1,11)
# time interval and list of time steps used for diffusion coefficients
dt = 250
mcslist = range(500,100001,dt)
# minimum number of time steps in a trajectory
minsize=10
# create "cluster size" bins: (low, up) pairs of width bsz starting at s0
bsz = 5
s0 = 0
bins = tuple((i,i+bsz) for i in range(s0,50,bsz))

# bin trajectories based on cluster sizes
msddict = {}
for n in repeats:
    # read data for a single simulation
    # returns a dictionary with bins and lists of msd's per bin
    ndata = readDataPerSim(bins,simname+'-'+str(n),datapath,minsize)
    # items() exists on both Python 2 and Python 3 dictionaries
    for key,msdlist in ndata.items():
        msddict[key] = (msddict.get(key,[]))+msdlist

# save mean msd for all bins
basename = simname+'_'+'binsz='+str(bsz)+'_binstart='+str(bins[0][0]+1)
for key in msddict:
    saveAllMSDPerBin(msddict[key],key,basename,dt,outpath)

# calculate translational and rotational diffusion coefficients for each bin
# cutoff value for p-values; fits with p-values larger than alpha are ignored
alpha = 1
# minimum number of trajectories needed to use a bin
minN = 50

# one row per bin: binmid, slopeT, errT, pT, slopeR, errR, pR
data = np.zeros((len(bins),7))
for i,(low,up) in enumerate(bins):
    # same label formula as used when the per-bin files were written
    binmid = .5*(1+low+up)
    data[i,0] = binmid
    fn = outpath+basename+'_msd_csz='+str(binmid)+'.data'
    if os.path.isfile(fn):
        [slopeT,errT,pT,slopeR,errR,pR] = calcD(fn,minN=minN)
        # discard fits that are not significant at level alpha
        if pT > alpha:
            slopeT = np.nan
            errT = np.nan
        if pR > alpha:
            slopeR = np.nan
            errR = np.nan
        data[i,1:7] = [slopeT,errT,pT,slopeR,errR,pR]
    else:
        # no MSD file was written for this bin
        data[i,1:7] = [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan]
# the context manager closes the result file even if writing fails
with open(outpath+simname+'_D.data','w') as f:
    f.write('#binmid\tslopeT\terrT\tpT\tslopeR\terrR\tpR\n')
    np.savetxt(f,data)
