import sys ,os, subprocess
import argparse

from numpy import percentile
from time import strftime

from LAFITE.reference_processing import RefProcessWrapper, short_reads_sj_import, cage_tss_import
from LAFITE.preprocessing import read_grouping, polya_signal_import, PolyAFinder
from LAFITE.utils import temp_dir_creation, bam2bed, keep_tmp_file
from LAFITE.read_collapsing import CoCoWrapper
from LAFITE.tailFinder import TailFinderWrapper
from LAFITE.refine import RefineWrapper
from LAFITE.output import OutputAssembly

gtf = '/expt/zjzace/Nanopore_subcellular/SIRV/SIRV_Set1/Raw_data/SIRV_isoforms_multi-fasta-annotation_C_170612a.gtf'
bed = '/expt/zjzace/Nanopore_subcellular/SIRV/SIRV_Set1/bam/SRR6058583.sorted.bed'
fa = '/expt/zjzace/Nanopore_subcellular/SIRV/SIRV_Set1/Raw_data/SIRV_isoforms_multi-fasta_170612a.fasta'
junction_dict, processed_read = read_grouping(bed, fa)
ref_exon, ref_junction, ref_single_exon_trans, ref_mutple_exon_trans, left_sj_set, right_sj_set, tss_dict = RefAnnotationExtraction(gtf).annotation_sorting()

# polya_dict = polya_signal_import('/expt/zjzace/Nanopore_subcellular/SIRV/SIRV_Set1/bam/SRR6058584.polya.res')
# polya_dict = PolyAFinder(processed_read, fa, '/home/zjzace/software/SQANTI3-4.1/data/polyA_motifs/mouse_and_human.polyA_motif.txt').polya_estimation()
# collected_single_exon_read, collected_multi_exon_read, tss, rss = CoCoWrapper(16, processed_read, ref_exon, ref_junction, ref_single_exon_trans, ref_mutple_exon_trans, left_sj_set, right_sj_set, junction_dict, tmp_dir='.', sj_correction_window=40, mis_intron_length = 150, corExcept_dis=4, polya_dict=polya_dict).result_collection()

class LafiteWrapper:
	def __init__(self, bam, bedtools, full_cleanup, gtf, genome, min_count_tss_tes, mis_intron_length, min_novel_trans_count, min_single_exon_coverage, min_single_exon_len, label, output, polya, polyA_motif_file, relative_abundance_threshold, sj_correction_window, short_sj_tab, thread, tss_cutoff, tss_peak):
		self.bam = bam
		self.bedtools = bedtools
		self.full_cleanup = full_cleanup
		self.gtf = gtf
		self.genome = genome
		self.min_count_tss_tes = min_count_tss_tes
		self.mis_intron_length = mis_intron_length
		self.min_novel_trans_count = min_novel_trans_count
		self.min_single_exon_coverage = min_single_exon_coverage
		self.min_single_exon_len = min_single_exon_len
		self.label = label
		self.output = output
		self.polya = polya
		self.polyA_motif_file = polyA_motif_file
		self.relative_abundance_threshold = relative_abundance_threshold
		self.sj_correction_window = sj_correction_window
		self.short_sj_tab = short_sj_tab
		self.thread = thread
		self.tss_cutoff = tss_cutoff
		self.tss_peak = tss_peak
		
	
	def revisit_parameter(self):
		"""
		revisit input parameters"""

		sys.stdout.write(f'\nInput parameters:\n')
		sys.stdout.write(f'Read alignment file: {self.bam}\n')
		sys.stdout.write(f'Reference gene annotation: {self.gtf}\n')
		sys.stdout.write(f'Reference genome annotation: {self.genome}\n')
		if self.polya:
			sys.stdout.write(f'Reads Polyadenylation events: {self.polya}\n')
		elif self.polyA_motif_file:
			sys.stdout.write(f'PolyA motif file for read Polyadenylation event estimation: {self.polyA_motif_file}\n')
		else:
			raise ValueError('Fatal: Please provide either polyA motif file or reads polyadenylation event result\n')
		sys.stdout.write(f'Edit distance to reference splicing site allowed for splicing correction: {self.sj_correction_window}\n')
		sys.stdout.write(f'Output assembly file: {self.output}\n')

	def run_lafite(self):
		"""
		LAFITE wrapper"""

		self.revisit_parameter()
		# create temp directory for log and intermediate files
		try:
			tmp_folder = temp_dir_creation(os.path.dirname(self.output))
			tmp_dir = tmp_folder.name
		except:
			raise ValueError('Fatal: Please provide a valid path for output files\n')

		# reference gene annotation processing
		sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ': Preprocessing reference gene annotation\n')
		ref_exon, ref_junction, ref_single_exon_trans, ref_mutple_exon_trans, left_sj_set, right_sj_set, tss_dict = RefProcessWrapper(self.gtf, self.thread).result_collection()

		if self.short_sj_tab:
			left_sj_set, right_sj_set = short_reads_sj_import(self.short_sj_tab, left_sj_set, right_sj_set)
		
		if self.tss_peak:
			tss_dict = cage_tss_import(self.tss_peak, tss_dict)

		# processing alignment bam file
		## convert alignment bam file to bed12 format
		sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ': Preprocessing alignment file\n')
		try: 
			bam2bed_cmd = bam2bed(self.bam, tmp_dir, self.bedtools)
		except:
			raise ValueError('Fatal: Please provide a valid path for bam file and bedtools\n')
		p = subprocess.run(bam2bed_cmd, shell=True)
		if p.returncode == 0:
			pass
		else:
			raise ValueError('Fatal: Error in bam conversion\n')

		## read grouping according to chromosome and strand
		outbed = os.path.join(tmp_dir, 'bam.bed')
		junction_dict, processed_read = read_grouping(outbed, self.genome)

		# polyA info import
		if self.polya:
			sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ': Loading polyA information\n')
			polya_dict = polya_signal_import(self.polya)
		else:
			sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ': No reads Polyadenylation event provided, detecting from sequence\n')
			polya_dict = PolyAFinder(processed_read, self.genome, self.polyA_motif_file).polya_estimation()

		# read correction and collapsing
		sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ': Collapssing corrected reads\n')
		collected_single_exon_read, collected_multi_exon_read, collected_rss, collected_res = CoCoWrapper(self.thread, processed_read, ref_exon, ref_junction, ref_single_exon_trans, ref_mutple_exon_trans, left_sj_set, right_sj_set, junction_dict, self.sj_correction_window, polya_dict, self.mis_intron_length, tmp_dir).result_collection()

		# identify putative TSS and TES for collapsed reads
		processed_collected_multi_exon_read, three_prime_exon = TailFinderWrapper(collected_multi_exon_read, self.min_count_tss_tes, self.thread).result_collection()

		# calculating the tss_cutoff and tes_cutoff:
		if not self.tss_cutoff:
			self.tss_cutoff = percentile(collected_rss, 75)
		tes_cutoff = percentile(collected_res, 70)
		print(f'TSS cutoff: {self.tss_cutoff}')
		print(f'TES cutoff: {tes_cutoff}')

		# identify high quality isoforms from collapsed reads
		sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ': Revisiting the collapsed reads to get high-concensus full-length isoforms\n')
		collected_refined_isoforms = RefineWrapper(processed_collected_multi_exon_read, collected_single_exon_read, ref_mutple_exon_trans, ref_single_exon_trans, three_prime_exon, tss_dict, self.tss_cutoff, tes_cutoff, self.min_novel_trans_count, self.min_single_exon_coverage, self.min_single_exon_len, self.thread, tmp_dir).result_collection()

		# output refined isoforms
		OutputAssembly(collected_refined_isoforms, self.output, self.label, self.relative_abundance_threshold).write_out()

		# keep intermediate results
		if self.full_cleanup:
			os.system(keep_tmp_file(self.output, tmp_dir))
		
		sys.stdout.write(strftime("%Y-%m-%d %H:%M:%S") + ': All done!\n')

def main():

	parser = argparse.ArgumentParser(description='Low-abundance Aware Full-length Isoform clusTEr')
	parser.add_argument('-b', dest='bam', help='path to the alignment file in bam format', required=True)
	parser.add_argument('-B', dest='bedtools', type=str, default='bedtools', help='path to the executable bedtools')
	parser.add_argument('-g', dest='gtf', help='path to the reference gene annotation in GTF format', required=True)
	parser.add_argument('-f', dest='genome', help='path to the reference genome fasta', required=True)
	parser.add_argument('-o', dest='output', help='path to the output file', required=True)
	parser.add_argument('-n', dest='min_count_tss_tes', type=int, default=3, help='minimum number of reads supporting a alternative TSS or TES, default: 3')
	parser.add_argument('-i', dest='mis_intron_length', type=int, default=150, help='length cutoff for correcting unexpected small intron, default: 150')
	parser.add_argument('-c', dest='min_novel_trans_count', type=int, default=3, help='minimum occurrences required for a isoform from novel loci, default: 3')
	parser.add_argument('-s', dest='min_single_exon_coverage', type=int, default=4, help='minimum read coverage required for a novel single-exon transcript, default: 4')
	parser.add_argument('-l', dest='min_single_exon_len', type=int, default=100, help='minimum length for single-exon transcript, default: 100')
	parser.add_argument('-L', dest='label', type=str, default='LAFT', help='name prefix for output transcripts, default: LAFT')
	parser.add_argument('-p', dest='polya', help='path to the file contains read Polyadenylation event')
	parser.add_argument('-m', dest='polyA_motif_file', help='path to the polya motif file')
	parser.add_argument('-r', dest='relative_abundance_threshold', type=int, default=0.01, help='minimum abundance of the predicted multi-exon transcripts as a fraction of the total transcript assembled at a given locus, default: 0.01')
	parser.add_argument('-j', dest='short_sj_tab', default=None, help='path to the short read splice junction file')
	parser.add_argument('-w', dest='sj_correction_window', type=int, default=40, help='edit distance to reference splicing site for splicing correction, default: 40')
	parser.add_argument('--no_full_cleanup', dest='full_cleanup', action='store_true', help='keep all intermediate files')
	parser.add_argument('-t', dest='thread', type=int, default=4, help='number of the threads, default: 4')
	parser.add_argument('-T', dest='tss_peak', default=None, help='path to the TSS peak file')
	parser.add_argument('-d', dest='tss_cutoff', type=int, default=None, help='minimum TSS distance for a transcript to be considered as a novel transcript')
	args = parser.parse_args()

	LafiteWrapper(args.bam, args.bedtools, args.full_cleanup, args.gtf, args.genome, args.min_count_tss_tes, args.mis_intron_length, args.min_novel_trans_count, args.min_single_exon_coverage, args.min_single_exon_len, args.label, args.output, args.polya, args.polyA_motif_file, args.relative_abundance_threshold, args.sj_correction_window, args.short_sj_tab, args.thread, args.tss_cutoff, args.tss_peak).run_lafite()

lafite_wrapper

`class` `LafiteWrapper`[source]

`main`[source]

lafite_wrapper

class LafiteWrapper[source]

main[source]

`class` `LafiteWrapper`[source]

`main`[source]