Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Latest commit

 

History

History
History
executable file
·
97 lines (86 loc) · 3.15 KB

File metadata and controls

executable file
·
97 lines (86 loc) · 3.15 KB
Copy raw file
Download raw file
Open symbols panel
Edit and raw actions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# 2015-11-15 Chengxin Zhang
# Usage/help text for the command-line interface; the __main__ block writes
# it to stderr when no input file is given on the command line.
docstring='''
split_fasta.py seq.txt seq.fasta
Split Multiple-Sequence-FASTA file "seq.txt" into single-sequence FASTA
files, whose name shall all be "seq.fasta".
Output each FASTA file into one folder, whose name is the same as the
header (sequence name)
option:
-batch_size=1
how many sequences are there in one fasta file.
if batch_size>1, split target sequence into mulitple-sequence FASTA
files of "batch_size" sequences
-exclude_list=list
exclude entries listed in "list"
'''
import sys,os
def split_fasta(infile="seq.txt",outfile="seq.fasta",outdir='.',
    batch_size=1,exclude_list=''):
    '''Split a multiple-sequence FASTA file into per-batch FASTA files.

    Every batch of "batch_size" consecutive entries is written to
    <outdir>/<header of the first entry in the batch>/<outfile>.
    Return the list of headers that were kept (in input order).

    Options:
        infile       - multiple-sequence FASTA file holding all input entries
        outfile      - name of the FASTA file created in each target folder;
                       either a single string or a list of names, in which
                       case each batch is written under every listed name
        outdir       - (default: current directory) parent folder of the
                       per-header target folders; created if missing
        batch_size   - number of sequences in each output FASTA file
        exclude_list - optional file; the first whitespace-delimited token
                       of each non-blank line is a header to skip. Ignored
                       if empty or not an existing file.
    '''
    infile=os.path.abspath(infile)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    outfile_list=[outfile] if isinstance(outfile,str) else list(outfile)

    # Plain 'r' is used instead of 'rU': the 'U' flag was removed in
    # Python 3.11, and splitlines() below already copes with any newline.
    exclude_set=set()
    if exclude_list and os.path.isfile(exclude_list):
        with open(exclude_list,'r') as fp:
            exclude_set=set(line.split()[0] for line in
                fp.read().splitlines() if line.strip())

    with open(infile,'r') as fp:
        txt=fp.read()

    header_list=[]
    sequence_list=[]
    for entry in txt.split('>'): # parse by entry
        if not entry.strip():
            continue
        header=entry.strip().split()[0] # name = first token of header line
        if header in exclude_set:
            continue
        header_list.append(header)
        sequence_list.append('\n'.join(entry.splitlines()[1:]))

    # Group the records into batches; a new batch starts every
    # "batch_size" entries and is named after its first header.
    batch_list=[]
    for idx,(header,sequence) in enumerate(zip(header_list,sequence_list)):
        if idx % batch_size == 0:
            batch_list.append((header,[]))
        batch_list[-1][1].append('>'+header+'\n'+sequence+'\n')

    for first_header,record_list in batch_list:
        # The target folder lives under outdir, not the working directory.
        target_dir=os.path.join(outdir,first_header)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        for name in outfile_list:
            with open(os.path.join(target_dir,name),'w') as fp:
                fp.writelines(record_list)
    return header_list
if __name__=="__main__":
    # Command-line entry point:
    #   split_fasta.py [-batch_size=N] [-exclude_list=FILE] seq.txt [seq.fasta]
    batch_size=1
    exclude_list=''
    argv=[] # positional arguments (input file, optional output file name)
    for arg in sys.argv[1:]:
        if arg.startswith("-batch_size="):
            try:
                batch_size=int(arg[len("-batch_size="):])
            except ValueError:
                # Non-numeric value: report it instead of dumping a traceback.
                sys.stderr.write(
                    "ERROR! batch_size must be positive integer\n")
                sys.exit(1)
        elif arg.startswith("-exclude_list="):
            exclude_list=arg[len("-exclude_list="):]
        elif arg.startswith('-'):
            # Unknown flags are reported and then ignored (non-fatal).
            sys.stderr.write("ERROR! Unknown option %s\n"%arg)
        else:
            argv.append(arg)

    if len(argv)<1:
        # No input file given: show the usage text and stop.
        sys.stderr.write(docstring)
        sys.exit()
    infile=argv[0]
    # The output file name is optional and defaults to "seq.fasta".
    outfile=argv[1] if len(argv)>1 else "seq.fasta"

    if batch_size<1:
        sys.stderr.write("ERROR! batch_size must be positive integer\n")
        sys.exit(1) # non-zero status so calling scripts can detect the error
    split_fasta(infile,outfile,
        batch_size=batch_size,exclude_list=exclude_list)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.