forked from kad-ecoli/python_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit_fasta.py
More file actions
executable file
·97 lines (86 loc) · 3.15 KB
/
split_fasta.py
File metadata and controls
executable file
·97 lines (86 loc) · 3.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# 2015-11-15 Chengxin Zhang
# Command-line usage text; written to stderr by the script entry point
# when no input file is given on the command line.
docstring='''
split_fasta.py seq.txt seq.fasta
Split Multiple-Sequence-FASTA file "seq.txt" into single-sequence FASTA
files, whose name shall all be "seq.fasta".
Output each FASTA file into one folder, whose name is the same as the
header (sequence name)
option:
-batch_size=1
how many sequences are there in one fasta file.
if batch_size>1, split target sequence into mulitple-sequence FASTA
files of "batch_size" sequences
-exclude_list=list
exclude entries listed in "list"
'''
import sys,os
def split_fasta(infile="seq.txt",outfile="seq.fasta",outdir='.',
    batch_size=1,exclude_list=''):
    '''Split a Multiple-Sequence-FASTA file into per-header FASTA files.

    Every run of "batch_size" consecutive kept entries is written to
    outdir/<header of the first entry in the run>/outfile.
    Return the list of kept headers, in input order.

    Options:
        infile - multiple-sequence FASTA file holding all input sequences
        outfile - file name given to every output FASTA file
        outdir - (default: current directory)
            folder under which the per-header folders are created;
            it is created if it does not exist yet
        batch_size - number of sequences in each output FASTA file
        exclude_list - path to a file listing headers to skip; only the
            first whitespace-delimited token of each non-blank line is
            used. Ignored when empty or when the file does not exist.
    '''
    infile=os.path.abspath(infile)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # Plain text mode is used instead of 'rU': the 'U' flag was removed in
    # Python 3.11, and splitlines() below copes with any newline style.
    exclude_set=set()
    if exclude_list and os.path.isfile(exclude_list):
        with open(exclude_list) as fp:
            exclude_set=set(line.split()[0] for line in
                fp.read().splitlines() if line.strip())

    with open(infile) as fp:
        txt=fp.read()

    header_list=[]
    sequence_list=[]
    for entry in txt.split('>'): # parse by entry
        if not entry.strip():
            continue
        # header is the first token of the entry; the rest of the
        # description line is dropped
        header=entry.strip().split()[0]
        if header in exclude_set:
            continue
        header_list.append(header)
        sequence_list.append('\n'.join(entry.splitlines()[1:]))

    fout=None
    try:
        for idx,(header,sequence) in enumerate(zip(header_list,sequence_list)):
            if idx % batch_size == 0:
                # start a new batch: close the previous file and open one
                # named after the first header of this batch
                if fout is not None:
                    fout.close()
                    fout=None
                # the folder must be created under outdir, which is where
                # the output file is opened
                target_dir=os.path.join(outdir,header)
                if not os.path.isdir(target_dir):
                    os.makedirs(target_dir)
                fout=open(os.path.join(target_dir,outfile),'w')
            fout.write('>'+header+'\n'+sequence+'\n')
    finally:
        # make sure the last batch is flushed and closed
        if fout is not None:
            fout.close()
    return header_list
if __name__=="__main__":
    # Command-line entry point: collect "-key=value" options and
    # positional arguments, then split the input FASTA file.
    batch_size=1
    exclude_list=''
    argv=[]
    for arg in sys.argv[1:]:
        if arg.startswith("-batch_size="):
            try:
                batch_size=int(arg[len("-batch_size="):])
            except ValueError:
                # non-numeric value: report it like any other bad
                # batch_size instead of dying with a traceback
                sys.stderr.write("ERROR! batch_size must be positive integer\n")
                sys.exit(1)
        elif arg.startswith("-exclude_list="):
            exclude_list=arg[len("-exclude_list="):]
        elif arg.startswith('-'):
            # unknown options are reported but do not stop the run
            sys.stderr.write("ERROR! Unknown option %s.\n"%arg)
        else:
            argv.append(arg)

    # first positional argument: input file (required);
    # second positional argument: output file name (optional)
    if len(argv)<1:
        sys.stderr.write(docstring)
        sys.exit()
    elif len(argv)<2:
        outfile="seq.fasta"
    else:
        outfile=argv[1]
    infile=argv[0]

    if batch_size<1:
        sys.stderr.write("ERROR! batch_size must be positive integer\n")
        sys.exit(1) # non-zero status so callers can detect the failure

    split_fasta(infile,outfile,
        batch_size=batch_size,exclude_list=exclude_list)