Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 423114f

Browse filesBrowse files
committed
batch_retrieval faster header search by grep,sed,cut
1 parent 036a24d commit 423114f
Copy full SHA for 423114f

File tree

Expand file treeCollapse file tree

1 file changed

+33
-13
lines changed
Open diff view settings
Filter options
Expand file treeCollapse file tree

1 file changed

+33
-13
lines changed
Open diff view settings
Collapse file

‎batch_retrieval.py‎

Copy file name to clipboardExpand all lines: batch_retrieval.py
+33-13Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@
1818
-db=uniprot_sprot.fasta.gz
1919
(only valid if -outfmt=fasta)
2020
directly extract fasta sequence from FASTA format sequence database
21+
only accession as will be preserved as fasta header
2122
'''
2223
import urllib,urllib2
2324
import sys,os
25+
import subprocess
2426

2527
url_mapping = 'http://www.uniprot.org/mapping/'
2628
url_upload = 'http://www.uniprot.org/uploadlists/'
@@ -70,18 +72,28 @@ def batch_retrival(query_list=[],infmt="ACC",outfmt="fasta",db=''):
7072
page = response.read()
7173
return page
7274

73-
def remove_entry_already_fetch(query_list,outfile):
74-
'''remove entries already outfile'''
75-
already_fetch_list=[]
76-
fp=open(outfile,'rU')
77-
for line in fp.read().splitlines():
78-
if line.startswith('>'):
79-
line=line.split()[0].lstrip('>')
80-
if '|' in line:
81-
line=line.split('|')[1]
82-
already_fetch_list.append(line)
83-
fp.close()
84-
query_list=[line for line in query_list if not line in already_fetch_list]
75+
def get_accession_list_from_fasta(fasta_file):
76+
'''get a list of accessions from fasta.
77+
This is faster than a pure python implement.
78+
'''
79+
if not os.path.isfile(fasta_file):
80+
return []
81+
cmd='z'*fasta_file.endswith(".gz")+ "grep '>' %s|"%fasta_file+ \
82+
"sed 's/^>[a-z]\{1,\}|//g'|sed 's/^>//g'|cut -d' ' -f1|cut -d'|' -f1"
83+
p=subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
84+
stdout,stderr=p.communicate()
85+
return stdout.splitlines()
86+
87+
def remove_entry_not_in_fasta_file(query_list,fasta_file):
88+
'''remove entries not in fasta file'''
89+
fasta_header_list=get_accession_list_from_fasta(fasta_file)
90+
query_list=list(set(query_list).intersection(fasta_header_list))
91+
return query_list
92+
93+
def remove_entry_in_fasta_file(query_list,fasta_file):
94+
'''remove entries in fasta file'''
95+
fasta_header_list=get_accession_list_from_fasta(fasta_file)
96+
query_list=list(set(query_list).difference(fasta_header_list))
8597
return query_list
8698

8799
if __name__=="__main__":
@@ -115,9 +127,17 @@ def remove_entry_already_fetch(query_list,outfile):
115127
fp=open(argv[0],'rU')
116128
query_list=fp.read().splitlines()
117129
fp.close()
130+
sys.stdout.write("%u query entries\n"%len(query_list))
131+
132+
if db:
133+
if infmt=="ACC":
134+
query_list=remove_entry_not_in_fasta_file(query_list,db)
135+
sys.stdout.write("%u query entries in db\n"%len(query_list))
118136

119137
if outfmt=="fasta":
120-
query_list=remove_entry_already_fetch(query_list,argv[1])
138+
if infmt=="ACC":
139+
query_list=remove_entry_in_fasta_file(query_list,argv[1])
140+
sys.stdout.write("%u query entries not fetched\n"%len(query_list))
121141
fp=open(argv[1],'a')
122142
else:
123143
fp=open(argv[1],'w')

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.