1818 -db=uniprot_sprot.fasta.gz
1919 (only valid if -outfmt=fasta)
2020 directly extract fasta sequence from FASTA format sequence database
21+ only accession as will be preserved as fasta header
2122'''
2223import urllib ,urllib2
2324import sys ,os
25+ import subprocess
2426
2527url_mapping = 'http://www.uniprot.org/mapping/'
2628url_upload = 'http://www.uniprot.org/uploadlists/'
@@ -70,18 +72,28 @@ def batch_retrival(query_list=[],infmt="ACC",outfmt="fasta",db=''):
7072 page = response .read ()
7173 return page
7274
73- def remove_entry_already_fetch (query_list ,outfile ):
74- '''remove entries already outfile'''
75- already_fetch_list = []
76- fp = open (outfile ,'rU' )
77- for line in fp .read ().splitlines ():
78- if line .startswith ('>' ):
79- line = line .split ()[0 ].lstrip ('>' )
80- if '|' in line :
81- line = line .split ('|' )[1 ]
82- already_fetch_list .append (line )
83- fp .close ()
84- query_list = [line for line in query_list if not line in already_fetch_list ]
75+ def get_accession_list_from_fasta (fasta_file ):
76+ '''get a list of accessions from fasta.
77+ This is faster than a pure python implement.
78+ '''
79+ if not os .path .isfile (fasta_file ):
80+ return []
81+ cmd = 'z' * fasta_file .endswith (".gz" )+ "grep '>' %s|" % fasta_file + \
82+ "sed 's/^>[a-z]\{1,\}|//g'|sed 's/^>//g'|cut -d' ' -f1|cut -d'|' -f1"
83+ p = subprocess .Popen (cmd ,shell = True ,stdout = subprocess .PIPE )
84+ stdout ,stderr = p .communicate ()
85+ return stdout .splitlines ()
86+
87+ def remove_entry_not_in_fasta_file (query_list ,fasta_file ):
88+ '''remove entries not in fasta file'''
89+ fasta_header_list = get_accession_list_from_fasta (fasta_file )
90+ query_list = list (set (query_list ).intersection (fasta_header_list ))
91+ return query_list
92+
93+ def remove_entry_in_fasta_file (query_list ,fasta_file ):
94+ '''remove entries in fasta file'''
95+ fasta_header_list = get_accession_list_from_fasta (fasta_file )
96+ query_list = list (set (query_list ).difference (fasta_header_list ))
8597 return query_list
8698
8799if __name__ == "__main__" :
@@ -115,9 +127,17 @@ def remove_entry_already_fetch(query_list,outfile):
115127 fp = open (argv [0 ],'rU' )
116128 query_list = fp .read ().splitlines ()
117129 fp .close ()
130+ sys .stdout .write ("%u query entries\n " % len (query_list ))
131+
132+ if db :
133+ if infmt == "ACC" :
134+ query_list = remove_entry_not_in_fasta_file (query_list ,db )
135+ sys .stdout .write ("%u query entries in db\n " % len (query_list ))
118136
119137 if outfmt == "fasta" :
120- query_list = remove_entry_already_fetch (query_list ,argv [1 ])
138+ if infmt == "ACC" :
139+ query_list = remove_entry_in_fasta_file (query_list ,argv [1 ])
140+ sys .stdout .write ("%u query entries not fetched\n " % len (query_list ))
121141 fp = open (argv [1 ],'a' )
122142 else :
123143 fp = open (argv [1 ],'w' )
0 commit comments