#import libraries for program

import mmap
import re
import numpy
import pandas
import sys
import os
import argparse

#-------Define Functions-------#

#Define functions to calculate mapping statistics. These will count the number of unique reads in a set of alignments.
#Two functions are defined, one for SE reads (merged with pear) and one for handling unmerged paired end reads (file contains R1 and R2)	
def uniquereadsSE(x):
	"""Count distinct read names among single-end alignment lines.

	x is an iterable of SAM alignment lines; the read name is the first
	whitespace-delimited field of each line.  Returns the number of
	unique read names.
	"""
	return len({entry.split()[0] for entry in x})

def uniquereadsPE(x):
	"""Count distinct read names among paired-end alignment lines.

	Uses the SAM flag (second field) to separate first-in-pair reads
	(flag bit 0x40) from second-in-pair reads (flag bit 0x80).

	Returns a tuple: (unique R1 names, unique R2 names,
	unique names across both mates).
	"""
	first = set()
	second = set()
	for entry in x:
		fields = entry.split()
		flagval = int(fields[1])
		if flagval & 64:
			first.add(fields[0])
		elif flagval & 128:
			second.add(fields[0])
	return len(first), len(second), len(first | second)

#Define classical rounding function - round 5 up - used for mapping statistics calculation
def rd(x,y=0):
	"""Round x half-up to y decimal places (classical rounding).

	Unlike the builtin round(), which uses banker's rounding, a digit of
	5 in the first discarded position always rounds up.  Returns the
	result of true division, i.e. a float.
	"""
	m = 10**y # shift factor for the requested number of decimal places
	q = x*m # shift the digits to keep to the left of the decimal point
	c = int(q) # truncated integer part
	i = int( (q-c)*10 ) # first digit beyond the rounding position
	if i >= 5:
		c += 1 # classical rounding: 5 and above rounds up
	return c/m
	
#Define function to parse CIGAR strings in the SAM file. 
#If the alignment is on the left side of the read, need to add alignment length to position to determine junction location (since leftmost position is always listed)
#If the alignment is on the right side of the read, do not need to adjust the alignment length
#Also includes a flag in case of clipping on both sides of the read (usually indicative of poor alignment). Should not be present in quality filtered reads.
def parsecigar(x):
	"""Parse a split read's CIGAR string and locate the junction offset.

	If the aligned portion is on the left side of the read, the
	alignment length must be added to the mapping position to find the
	junction (SAM always reports the leftmost position); if it is on the
	right side, no adjustment is needed.

	Returns (addbases, flag): addbases is the number of bases to add to
	the leftmost position to reach the junction, and flag is 1 when the
	read is clipped on both sides (usually indicative of poor
	alignment), else 0.  Raises Exception when the CIGAR contains no
	soft/hard clipping (i.e. the read is not split).
	"""
	# Raw string avoids the invalid "\d" escape warning; a CIGAR operation
	# is one digit-run followed by a single letter code.
	ops = re.findall(r'(\d+)([A-Za-z])', x)
	flat = [item for pair in ops for item in pair]
	flag = 0
	if 'S' in flat or 'H' in flat:
		if flat.count('M') == 1:
			if flat[1] == 'M':
				addbases = int(flat[0]) - 1 # match first: shift by its length
			else:
				addbases = 0 # clipping first: leftmost position is the junction
		else:
			# Find the index of the first S or H operation, then add all of
			# the lengths preceding the clipping to shift the junction position.
			letters = [op for _, op in ops]
			stop = [i for i, e in enumerate(letters) if (e == 'S' or e == 'H')][0]
			addbases = sum(int(n) for n, _ in ops[:stop]) - 1
		if flat.count('S') > 1 or flat.count('H') > 1:
			flag = 1 # flag clipping on both sides of the read
	else:
		raise Exception('Error, CIGAR input is missing clipping, non split reads given as input')
	return addbases, flag

#-------End Functions-------#


def main():
	"""Filter a SAM file to locate vector integration sites in the genome.

	Keeps split reads spanning a vector-genome junction, reports mapping
	statistics, determines the junction coordinate for each read, and
	writes per-read junctions, per-site read counts, and cumulative
	statistics tables to the output directory.
	"""
	#Create argument parser object. Takes user inputs for script and parses them.
	parser=argparse.ArgumentParser(description='Filter a SAM file to find Vector integration sites in the genome.')
	optional = parser._action_groups.pop() # creates group of optional arguments
	required = parser.add_argument_group('required arguments') # creates group of required arguments

	#Add input arguments for program. Sample and vector name are required, the rest will have default values if not specified
	required.add_argument('-s','--sam',help='name of SAM file',required=True)
	required.add_argument('-v','--vector',help='name of vector scaffold',required=True)
	optional.add_argument('-c','--cutoff',help='specify number of reads to use as a cutoff for a true integration site (default = 5 reads). If set to 0, will use mean+/-stdev cutoff.',type=int,default=5)
	optional.add_argument('-p','--PE',help='specify type of input file: paired end reads (PE=1,default), or reads merged with pear (PE=0)',type=int,default=1,choices=[0,1])
	optional.add_argument('-i','--indir',help='directory containing SAM file (default: current working directory)',default=(os.getcwd()+'/'))
	optional.add_argument('-o','--outdir',help='output directory (default: current working directory)',default=(os.getcwd()+'/'))
	optional.add_argument('-m','--mapq',help='minimum MAPQ for split reads (default: 30)', type=int, default=30)
	optional.add_argument('-nm','--mismatch',help='maximum number of mismatches for split reads (default: 3)', type=int, default=3)

	parser._action_groups.append(optional) #add optional values after the required ones in the help file

	args = parser.parse_args() # get the arguments from the program input, set them to args

	#Assign user inputs to variables for use in the program
	VecScaf=args.vector
	mapq=args.mapq-1 #minus 1 to MAPQ cutoff supplied since the filtering is for greater than, not greater or equal
	mismatch=args.mismatch+1 #add 1 to number of mismatches supplied since the filtering is for less than, not less or equal
	PE=args.PE
	outputdir=args.outdir

	#Check to see if .sam was included when SAM file name was supplied, and modify Sample and filename variables accordingly
	if args.sam.endswith('.sam'):
		Sample=args.sam[:-4]
		filename=args.indir+args.sam
	elif args.sam.endswith('.bam'):
		print('Error: BAM file not allowed as input. SAM file is required.\nProgram exit.')
		sys.exit(1)
	else:
		Sample=args.sam
		filename=args.indir+args.sam+'.sam'

	print("Sample Name:",Sample)
	print("SAM file:",filename)

	if PE==0:
		print("Running in single end mode (for reads merged with PEAR).")
	else:
		print("Running in paired-end read mode.")

	print('\nStarting SAM filtering...')

	#Create list variables to hold alignments of various categories
	data=[]
	lowqual=[]
	header=[]
	Vectonly=[]
	Genomeonly=[]
	unmapped=[]

	#Filter the SAM file.
		#1. First, ignore header lines of the file (lines that start with @), and sort them into their own list.
		#2. Search the file line by line and sort lines into different categories based on main scaffold and position, quality, and secondary scaffold(s) and position(s):
			#Split reads: contain SA:Z tag (Split read), Vector and Genome appear at least once in the primary or secondary scaffolds, and reads pass MAPQ and mismatch cutoffs
			#Low quality split reads: same as above, but failed MAPQ and mismatch test
			#Vector only: line contains only vector alignments. May contain Vector to Vector split reads.
			#Genome only: line contains only genome alignments. May contain Genome to Genome split reads.
			#Unmapped: Read did not map to genome or vector.
		#3. Throughout this process, a counter is used to keep track of the lines and sorting. Program will denote if any line did not fit into one of the above categories.

	count=[0,0,0,0,0,0,0,0] #1st element: number of lines with Vector in primary or split alignment passed quality, 2nd: number of lines with Vector and SA that failed quality
	#3rd: number of header lines, 4th: remaining lines (unaccounted for), 5th: total number of lines, 6th: lines which map only to Vector
	#7th: lines which map only to Genome, #8th: unmapped reads

	#Open read-only: the SAM file is never written to ('r+' would fail on read-only input)
	with open(filename, 'r') as hFile:
		for line in hFile:
			if not line.startswith("@"):
				#Capture up to two SA:Z split alignments (scaffold, MAPQ, NM each) and the primary NM tag; raw strings avoid invalid-escape warnings
				m = re.search(r'SA:Z:(\S+?),\d+?,\S+?,\S+?,(\d+?),(\d+?);(?:(\S+?),\d+?,\S+?,\S+?,(\d+?),(\d+?);)?',line)
				m2 = re.search(r'NM:i:(\d+)',line)
				col = line.split()
				if VecScaf in col[2] and m and not (m.group(1)==VecScaf and (m.group(4)==VecScaf or not m.group(4))):
					if (int(col[4])>mapq and (int(m.group(2))>mapq or (int(m.group(5)) if m.group(5) else 0)>mapq)) and (int(m2.group(1))<mismatch and (int(m.group(3))<mismatch or (int(m.group(6)) if m.group(6) else (mismatch+1))<mismatch)):
						data.append(line)
						count[0] += 1
					else:
						count[1] += 1
						lowqual.append(line)
				elif not VecScaf in col[2] and m and VecScaf in m.group(1,4):
					if (int(col[4])>mapq and (int(m.group(2))>mapq or (int(m.group(5)) if m.group(5) else 0)>mapq)) and (int(m2.group(1))<mismatch and (int(m.group(3))<mismatch or (int(m.group(6)) if m.group(6) else (mismatch+1))<mismatch)):
						data.append(line)
						count[0] += 1
					else:
						count[1] += 1
						lowqual.append(line)
				elif not int(col[1]) & 4:
					if VecScaf in col[2] and (not m or (m.group(1)==VecScaf and (not m.group(4) or m.group(4)==VecScaf))):
						Vectonly.append('\t'.join(col[0:2]))
						count[5] += 1
					elif not VecScaf in col[2] and (not m or (m.group(1)!=VecScaf and (not m.group(4) or m.group(4)!=VecScaf))):
						Genomeonly.append('\t'.join(col[0:2]))
						count[6] += 1
				elif int(col[1]) & 4:
					count[7] += 1
					unmapped.append('\t'.join(col[0:2]))
				else:
					count[3] += 1
			else:
				count[2] += 1
				header.append(line)
			count[4] += 1


	print('SAM filtering complete.')

	if sum(count[0:8])-count[4] == count[4]:
		print("All alignments accounted for.")
	else:
		print("Some alignments are missing from the count")

	#attach header back onto filtered split reads
	datawh=header + data
	datawh2=''.join(datawh)

	#Save new sam file containing only filtered alignments with Vector Genome junction
	with open(outputdir+Sample+'_SR.sam', "w") as out_file:
		out_file.write(datawh2)

	print("\nSaved filtered SAM file:",outputdir+Sample+'_SR.sam')

	#Calculate and print mapping statistics based on unique read names in each list category. Use different function for paired end or merged reads.
	#Also, creates stats variable to format statistics for table entry
	print('\nMapping Statistics:')
	if PE==1:
		x=uniquereadsPE((data+lowqual+Genomeonly+Vectonly+unmapped))
		v=uniquereadsPE(Vectonly)
		g=uniquereadsPE(Genomeonly)
		sr=uniquereadsPE(data)
		srlq=uniquereadsPE(lowqual)
		u=uniquereadsPE(unmapped)
		statsPE=[x[0],x[1],str(v[0])+" ("+str(rd(v[0]*100/x[0],1))+"%)",str(v[1])+" ("+str(rd(v[1]*100/x[1],1))+"%)",
							str(g[0])+" ("+str(rd(g[0]*100/x[0],1))+"%)",str(g[1])+" ("+str(rd(g[1]*100/x[1],1))+"%)",
							str(sr[0])+" ("+str(rd(sr[0]*100/x[0],4))+"%)",str(sr[1])+" ("+str(rd(sr[1]*100/x[1],4))+"%)",
							str(srlq[0])+" ("+str(rd(srlq[0]*100/x[0],4))+"%)",str(srlq[1])+" ("+str(rd(srlq[1]*100/x[1],4))+"%)",
							str(u[0])+" ("+str(rd(u[0]*100/x[0],1))+"%)",str(u[1])+" ("+str(rd(u[1]*100/x[1],1))+"%)"]
		print("Total Number of Reads:")
		print("\tR1 unique reads: ",statsPE[0],"\n\tR2 unique reads: ",statsPE[1],"\n\tUnique reads among both: ",x[2],sep='')
		print('\nAligned to Vector Only:')
		print("\tR1 unique reads: ",statsPE[2],"\n\tR2 unique reads: ",statsPE[3],"\n\tUnique reads among both: ",v[2],sep='')
		print('\nAligned to Genome Only:')
		print("\tR1 unique reads: ",statsPE[4],"\n\tR2 unique reads: ",statsPE[5],"\n\tUnique reads among both: ",g[2],sep='')
		print('\nSplit Reads between Vector-Genome (Passed Quality):')
		print("\tR1 unique reads: ",statsPE[6],"\n\tR2 unique reads: ",statsPE[7],"\n\tUnique reads among both: ",sr[2],sep='')
		print('\nSplit Reads between Vector-Genome (Failed Quality):')
		print("\tR1 unique reads: ",statsPE[8],"\n\tR2 unique reads: ",statsPE[9],"\n\tUnique reads among both: ",srlq[2],sep='')
		print('\nUnmapped Reads:')
		print("\tR1 unique reads: ",statsPE[10],"\n\tR2 unique reads: ",statsPE[11],"\n\tUnique reads among both: ",u[2],sep='')
	elif PE==0:
		x=uniquereadsSE((data+lowqual+Genomeonly+Vectonly+unmapped))
		v=uniquereadsSE(Vectonly)
		g=uniquereadsSE(Genomeonly)
		sr=uniquereadsSE(data)
		srlq=uniquereadsSE(lowqual)
		u=uniquereadsSE(unmapped)
		statsSE=[x,str(v)+" ("+str(rd(v*100/x,1))+"%)",
							str(g)+" ("+str(rd(g*100/x,1))+"%)",
							str(sr)+" ("+str(rd(sr*100/x,4))+"%)",
							str(srlq)+" ("+str(rd(srlq*100/x,4))+"%)",
							str(u)+" ("+str(rd(u*100/x,1))+"%)"]
		print("Total Number of Reads:",statsSE[0])
		print('Aligned to Vector Only: ',statsSE[1],sep='')
		print('Aligned to Genome Only: ',statsSE[2],sep='')
		print('Split Reads between Vector-Genome (Passed Quality): ',statsSE[3],sep='')
		print('Split Reads between Vector-Genome (Failed Quality): ',statsSE[4],sep='')
		print('Unmapped Reads: ',statsSE[5],sep='')
	else:
		print('Missing input parameter - Specify paired end vs. merged reads')


	#Isolate Vector-Genome Junction for each read. Uses the parsecigar function to adjust junction location based on CIGAR tag.
		#1. Looks only at non-secondary alignments based on SAM flag
		#2. Capture parts of the split alignment tag (the scaffold, position, and cigar tag)
		#3. Assign the vector, vector position, genome scaffold, and genome position to variables based on which part
		# 	of the alignment contained the correct information and after adjusting junction position with the CIGAR tag.
		#4. Add a column that denotes strand (R1 or R2) if using paired end reads, or NA if using merged reads.
		#5. Append junction for this read to a list.
	print('\nDetermining Vector-Genome junctions...')

	junctions=['ReadName\tPairEnd\tScaffold\tScaffoldPosition\tVector\tVectorPosition\n']
	f=0
	for line in data:
		col=line.split()
		if not int(col[1]) & 2048:
			m = re.search(r'SA:Z:(\S+?),(\d+?),\S+?,(\S+?),\d+?,\d+?;(?:(\S+?),(\d+?),\S+?,(\S+?),\d+?,\d+?;)?',line)
			if m.group(1) and not m.group(4):
				if not VecScaf in col[2]:
					Gen=col[2]
					gpos=int(col[3])+parsecigar(col[5])[0]
					Vec=m.group(1)
					vpos=int(m.group(2))+parsecigar(m.group(3))[0]
				elif VecScaf in col[2]:
					Vec=col[2]
					vpos=int(col[3])+parsecigar(col[5])[0]
					Gen=m.group(1)
					gpos=int(m.group(2))+parsecigar(m.group(3))[0]
				f=f+parsecigar(col[5])[1]+parsecigar(m.group(3))[1]
			else:
				if not VecScaf in col[2]:
					if m.group(1)==VecScaf and m.group(4)==VecScaf:
						Gen=col[2]
						gpos=int(col[3])+parsecigar(col[5])[0]
						Vec=m.group(1)
						vpos=sorted([int(m.group(2))+parsecigar(m.group(3))[0],int(m.group(5))+parsecigar(m.group(6))[0]])
					elif m.group(1)==VecScaf and not m.group(4)==VecScaf:
						Gen=[col[2],m.group(4)]
						gpos=[int(col[3])+parsecigar(col[5])[0],int(m.group(5))+parsecigar(m.group(6))[0]]
						Vec=m.group(1)
						vpos=int(m.group(2))+parsecigar(m.group(3))[0]
					elif not m.group(1)==VecScaf and m.group(4)==VecScaf:
						Gen=[col[2],m.group(1)]
						gpos=[int(col[3])+parsecigar(col[5])[0],int(m.group(2))+parsecigar(m.group(3))[0]]
						Vec=m.group(4)
						vpos=int(m.group(5))+parsecigar(m.group(6))[0]
				elif VecScaf in col[2]:
					if not VecScaf in m.group(1,4):
						Gen=[m.group(1),m.group(4)]
						gpos=[int(m.group(2))+parsecigar(m.group(3))[0],int(m.group(5))+parsecigar(m.group(6))[0]]
						Vec=col[2]
						vpos=int(col[3])+parsecigar(col[5])[0]
					elif m.group(1)==VecScaf and not m.group(4)==VecScaf:
						Gen=m.group(4)
						gpos=int(m.group(5))+parsecigar(m.group(6))[0]
						Vec=col[2]
						vpos=sorted([int(col[3])+parsecigar(col[5])[0],int(m.group(2))+parsecigar(m.group(3))[0]])
					elif not m.group(1)==VecScaf and m.group(4)==VecScaf:
						Gen=m.group(1)
						gpos=int(m.group(2))+parsecigar(m.group(3))[0]
						Vec=col[2]
						vpos=sorted([int(col[3])+parsecigar(col[5])[0],int(m.group(5))+parsecigar(m.group(6))[0]])
				f=f+parsecigar(col[5])[1]+parsecigar(m.group(3))[1]+parsecigar(m.group(6))[1]
			if int(col[1]) & 64:
				strand='R1'
			elif int(col[1]) & 128:
				strand='R2'
			else:
				strand='NA'
			junctions.append('\t'.join(map(str,[col[0],strand,Gen,gpos,Vec,vpos]))+'\n')

	print("Done.",(len(junctions)-1),"reads processed.")

	#Join the list of junctions together
	junctions2=''.join(junctions)

	#Create a pandas dataframe from the data for further processing
	jdata=[i.split('\t') for i in junctions2.splitlines()]
	d1 = pandas.DataFrame(jdata[1:],columns=jdata[0])

	#Write junction locations for each read to file
	d1.to_csv(outputdir+Sample+'_SR_readjunctions.txt',sep='\t',index=False,header=True)
	print("\nSaved junction locations for each read to file:",outputdir+Sample+'_SR_readjunctions.txt')

	#Count number of reads supporting each integration site
	d2= d1.groupby(['Scaffold','ScaffoldPosition','Vector','VectorPosition']).size().reset_index(name='ReadCount').sort_values(by=['ReadCount','Scaffold'],ascending=False).reset_index(drop=True)

	#Add column for paired end reads which counts only unique DNA fragments supporting each read
	if PE==1:
		#if using paired end reads without merging, include an additional column which counts the number of unique reads supporting the integration site (count R1 and R2 as one)
		d3=d1[['ReadName','Scaffold','ScaffoldPosition','Vector','VectorPosition']].drop_duplicates()
		d4=d3.groupby(['Scaffold','ScaffoldPosition','Vector','VectorPosition']).size().reset_index(name='UniqueReadCount').sort_values(by=['UniqueReadCount','Scaffold'],ascending=False).reset_index(drop=True)
		dm=pandas.merge(d2,d4,on=['Scaffold','ScaffoldPosition','Vector','VectorPosition']).sort_values(by=['UniqueReadCount','Scaffold'],ascending=False).reset_index(drop=True)
	elif PE==0:
		#no need to modify read count file if using reads merged with pear or single end reads
		dm=d2

	print("\nTotal number of integration sites found:",len(dm))

	#Filter the integration site file based on user specified cutoff. Use unique reads for paired end, and total number for merged reads.
	#Output statistics for this sample to general table for comparison between samples
	#Create filtered read file (dmfilt). dm is sorted by # of reads, so can pick up top 'rcutoff' # of sites
	rcutoff=0
	if PE==1:
		meanstd=numpy.mean(dm['UniqueReadCount'])+numpy.std(dm['UniqueReadCount']) #filter integration sites to have more reads supporting them than the mean + 1 standard deviation
		if args.cutoff==0:
			cut=meanstd
		else:
			cut=args.cutoff
		for i in range(0,len(dm)):
			if dm['UniqueReadCount'].tolist()[i]>=cut:
				rcutoff+=1
			else:
				break
		ms=str(rd(numpy.mean(dm['UniqueReadCount']),2))+" ± "+str(rd(numpy.std(dm['UniqueReadCount']),2))
		print("Mean number of reads per integration site:",ms)
		print("Read cutoff for called integration sites:", rd(cut,0))
		dmfilt=dm[:rcutoff]
		tablehPE=['Sample','IntSite#','TotalR1','TotalR2','VectorOnlyR1','VectorOnlyR2','GenomeOnlyR1','GenomeOnlyR2','SR_Vector-GenomeR1','SR_Vector-GenomeR2','SR_lowqualR1','SR_lowqualR2','UnmappedR1','UnmappedR2','MeanReads/Site','ReadCutoff']
		if os.path.exists(outputdir+'Sorted_reads_statistics.txt'):
			statsdf=pandas.read_csv(outputdir+'Sorted_reads_statistics.txt',sep='\t')
			statsdfadd=pandas.DataFrame([[Sample,len(dmfilt)]+statsPE+[ms,rd(cut,0)]],columns=tablehPE)
			statsdfnew=pandas.concat([statsdf,statsdfadd]) #DataFrame.append was removed in pandas 2.0
			statsdfnew.to_csv(outputdir+'Sorted_reads_statistics.txt',sep='\t',index=False,header=True)
			print('\nAdded read statistics for {Sample} to table file:'.format(Sample=Sample),outputdir+'Sorted_reads_statistics.txt')
		else:
			statsdfnew=pandas.DataFrame([[Sample,len(dmfilt)]+statsPE+[ms,rd(cut,0)]],columns=tablehPE)
			statsdfnew.to_csv(outputdir+'Sorted_reads_statistics.txt',sep='\t',index=False,header=True)
			print('\nAdded read statistics for {Sample} to newly created table file:'.format(Sample=Sample),outputdir+'Sorted_reads_statistics.txt')
		if os.path.exists(outputdir+'Sorted_reads_unique_statistics.txt'):
			statsudf=pandas.read_csv(outputdir+'Sorted_reads_unique_statistics.txt',sep='\t')
			statsudfadd=pandas.DataFrame([[Sample,x[2],v[2],g[2],sr[2],srlq[2],u[2]]],columns=['Sample','TotalReads','VectorOnly','GenomeOnly','SR_Vector-Genome','SR_lowqual','Unmapped'])
			statsudfnew=pandas.concat([statsudf,statsudfadd]) #DataFrame.append was removed in pandas 2.0
			statsudfnew.to_csv(outputdir+'Sorted_reads_unique_statistics.txt',sep='\t',index=False,header=True)
			print('\nAdded unique read statistics for {Sample} to table file:'.format(Sample=Sample),outputdir+'Sorted_reads_unique_statistics.txt')
		else:
			statsudfnew=pandas.DataFrame([[Sample,x[2],v[2],g[2],sr[2],srlq[2],u[2]]],columns=['Sample','TotalReads','VectorOnly','GenomeOnly','SR_Vector-Genome','SR_lowqual','Unmapped'])
			statsudfnew.to_csv(outputdir+'Sorted_reads_unique_statistics.txt',sep='\t',index=False,header=True)
			print('\nAdded unique read statistics for {Sample} to newly created table file:'.format(Sample=Sample),outputdir+'Sorted_reads_unique_statistics.txt')
	elif PE==0:
		meanstd=numpy.mean(dm['ReadCount'])+numpy.std(dm['ReadCount']) #filter integration sites to have more reads supporting them than the mean + 1 standard deviation
		if args.cutoff==0:
			cut=meanstd
		else:
			cut=args.cutoff
		for i in range(0,len(dm)):
			if dm['ReadCount'].tolist()[i]>=cut:
				rcutoff+=1
			else:
				break
		ms=str(rd(numpy.mean(dm['ReadCount']),2))+" ± "+str(rd(numpy.std(dm['ReadCount']),2))
		print("Mean number of reads per integration site:", ms)
		print("Read cutoff for called integration sites:", rd(cut,0))
		dmfilt=dm[:rcutoff]
		tablehSE=['Sample','IntSite#','TotalReads','VectorOnly','GenomeOnly','SR_Vector-Genome','SR_lowqual','Unmapped','MeanReads/Site','ReadCutoff']
		if os.path.exists(outputdir+'Sorted_SEreads_statistics.txt'):
			statsdf=pandas.read_csv(outputdir+'Sorted_SEreads_statistics.txt',sep='\t')
			statsdfadd=pandas.DataFrame([[Sample,len(dmfilt)]+statsSE+[ms,rd(cut,0)]],columns=tablehSE)
			statsdfnew=pandas.concat([statsdf,statsdfadd]) #DataFrame.append was removed in pandas 2.0
			statsdfnew.to_csv(outputdir+'Sorted_SEreads_statistics.txt',sep='\t',index=False,header=True)
			print('\nAdded read statistics for {Sample} to table file:'.format(Sample=Sample),outputdir+'Sorted_SEreads_statistics.txt')
		else:
			statsdfnew=pandas.DataFrame([[Sample,len(dmfilt)]+statsSE+[ms,rd(cut,0)]],columns=tablehSE)
			statsdfnew.to_csv(outputdir+'Sorted_SEreads_statistics.txt',sep='\t',index=False,header=True)
			print('\nAdded read statistics for {Sample} to newly created table file:'.format(Sample=Sample),outputdir+'Sorted_SEreads_statistics.txt')
	else:
		print('Missing input parameter - Specify paired end vs. merged reads')

	print("\nNumber of integration sites after filtering:",len(dmfilt))

	dm.to_csv(outputdir+Sample+'_IntSites_all.txt',sep='\t',index=False,header=True)
	print("\nSaved list of integration sites to file:",outputdir+Sample+'_IntSites_all.txt')

	dmfilt.to_csv(outputdir+Sample+'_IntSites_filtered.txt',sep='\t',index=False,header=True)
	print("\nSaved filtered integration sites to file:",outputdir+Sample+'_IntSites_filtered.txt')

	dmfilt.insert(loc=0,column='Sample',value=[Sample]*len(dmfilt))
	if PE==1:
		if os.path.exists(outputdir+'Allsamples_filtered_integration_sites.txt'):
			intsites=pandas.read_csv(outputdir+'Allsamples_filtered_integration_sites.txt',sep='\t')
			intsitenew=pandas.concat([intsites,dmfilt]) #DataFrame.append was removed in pandas 2.0
			intsitenew.to_csv(outputdir+'Allsamples_filtered_integration_sites.txt',sep='\t',index=False,header=True)
			print('\nAdded integration sites for {Sample} to table file:'.format(Sample=Sample),outputdir+'Allsamples_filtered_integration_sites.txt')
		else:
			dmfilt.to_csv(outputdir+'Allsamples_filtered_integration_sites.txt',sep='\t',index=False,header=True)
			print('\nSaved integration sites for {Sample} to newly created table file:'.format(Sample=Sample),outputdir+'Allsamples_filtered_integration_sites.txt')
	elif PE==0:
		if os.path.exists(outputdir+'Allsamples_merged_filtered_integration_sites.txt'):
			intsites=pandas.read_csv(outputdir+'Allsamples_merged_filtered_integration_sites.txt',sep='\t')
			intsitenew=pandas.concat([intsites,dmfilt]) #DataFrame.append was removed in pandas 2.0
			intsitenew.to_csv(outputdir+'Allsamples_merged_filtered_integration_sites.txt',sep='\t',index=False,header=True)
			print('\nAdded integration sites for {Sample} to table file:'.format(Sample=Sample),outputdir+'Allsamples_merged_filtered_integration_sites.txt')
		else:
			dmfilt.to_csv(outputdir+'Allsamples_merged_filtered_integration_sites.txt',sep='\t',index=False,header=True)
			print('\nSaved integration sites for {Sample} to newly created table file:'.format(Sample=Sample),outputdir+'Allsamples_merged_filtered_integration_sites.txt')
	else:
		print('Missing input parameter - Specify paired end vs. merged reads')

	print("\nPipeline complete.\n")
	

if __name__ == '__main__':
	main() # no trailing semicolon; indent matches the rest of the file (tabs)

