From 61632d611aceac8592977fdd6e660513b60fdd0f Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Sun, 8 Apr 2018 17:55:02 -0400 Subject: [PATCH 01/32] fix bug in pgen template sequence --- initdat2pdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/initdat2pdb.py b/initdat2pdb.py index 537b1ed..7854cb4 100755 --- a/initdat2pdb.py +++ b/initdat2pdb.py @@ -156,7 +156,7 @@ def convert_initdat_txt(sequence='',initdat_txt='',CONECT=False): template_segment=line[54:59]+ \ code_with_modified_residues[line[60:63]]+" " \ - if len(line)>=63 else " " + if line[60:63].strip() else " " target_segment=line[21:26]+ \ residue_name_short+" " #code_with_modified_residues[line[17:20]]+" " @@ -216,7 +216,7 @@ def convert_initdat_txt(sequence='',initdat_txt='',CONECT=False): sequence=sequence[:resi]+residue_name_short+sequence[resi+1:] template_sequence+='-'*(resi-len(template_sequence)) - if len(line)>=63: + if line[60:63].strip(): template_sequence=template_sequence[:resi]+ \ code_with_modified_residues[line[60:63]]+template_sequence[resi+1:] else: From f1e7e9f4edb0b3bf0fe99d8e8dd46da4278788c5 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Wed, 9 May 2018 21:44:14 -0400 Subject: [PATCH 02/32] batch retrieve GO term annotations --- batch_retrieval.py | 90 +++++++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 25 deletions(-) diff --git a/batch_retrieval.py b/batch_retrieval.py index b1605fd..e878893 100755 --- a/batch_retrieval.py +++ b/batch_retrieval.py @@ -5,11 +5,15 @@ output fetch result to "uniprot.fasta" Options: - -outfmt={fasta,txt,tab,xml,rdf,list,html} + -outfmt={fasta,txt,tab,xml,rdf,list,html,gaf,gpad,tsv} output format. default "fasta" If output format is "fasta", update output fetch result by adding new entries into output fetch result. - Otherwise, overwrite output fetch result + Otherwise, overwrite output fetch result. + + If output format is gaf, gpad or tsv, retrieve the Gene Ontology + for respective entries. In this case, -infmt must be ACC. + -db is ignored. -infmt={ACC+ID,ACC,ID,UPARC,NF50,NF90,NF100,GENENAME,...} input format. default 'ACC' (uniprot accession). See @@ -32,6 +36,11 @@ # list will be splitted into small lists of "split_size" entries split_size=20000 +from string import Template +import requests + +requestURL = "https://www.ebi.ac.uk/QuickGO/services/annotation/downloadSearch?limit=100&geneProductId=UniProtKB:" + def batch_retrival(query_list=[],infmt="ACC",outfmt="fasta",db=''): ''' If "db" is empty, batch retrieve uniprot entries listed in @@ -96,6 +105,25 @@ def remove_entry_in_fasta_file(query_list,fasta_file): query_list=list(set(query_list).difference(fasta_header_list)) return query_list +def request_GO(query,outfmt="gaf"): + ''' request gpad or gaf format GO annotation ''' + page='' + txt='' + p=1 + while p: + r = requests.get("%s%s&page=%d"%(requestURL,query,p), + headers={ "Accept" : "text/"+outfmt}) + if not r.ok: + r.raise_for_status() + responseBody = ''.join([line+'\n' for line in str(r.text + ).splitlines() if line.startswith("UniProtKB")]) + if responseBody==txt: + break + txt=str(responseBody) + page+=responseBody + p+=1 + return page + if __name__=="__main__": infmt="ACC" # input uniprot accessions outfmt="fasta" # output fasta sequences @@ -129,28 +157,40 @@ def remove_entry_in_fasta_file(query_list,fasta_file): fp.close() sys.stdout.write("%u query entries\n"%len(query_list)) - if db: - if infmt=="ACC": - query_list=remove_entry_not_in_fasta_file(query_list,db) - sys.stdout.write("%u query entries in db\n"%len(query_list)) - - if outfmt=="fasta": - if infmt=="ACC": - query_list=remove_entry_in_fasta_file(query_list,argv[1]) - sys.stdout.write("%u query entries not fetched\n"%len(query_list)) - fp=open(argv[1],'a') - else: + if outfmt in ["gaf","gpad","tsv"]: + if infmt!="ACC": + sys.stderr.write("ERROR! -infmt must be 'ACC' when -outfmt=gaf") + exit() + if db: + sys.stderr.write("WARNING! ignore -db because -outfmt=gaf") fp=open(argv[1],'w') - for i in range(0,len(query_list),split_size): - sys.stdout.write("Retrieving entries %u-%u\n"%( - i+1,min(i+split_size,len(query_list)))) - while (True): # keep trying to retrieve entry until success - try: - page=batch_retrival(query_list[i:i+split_size], - infmt,outfmt,db) - fp.write(page) - fp.flush() - break - except Exception,error: - sys.stderr.write(str(error)+'\n') + for query in query_list: + page=request_GO(query,outfmt) + fp.write(page) + fp.flush() + else: + if db: + if infmt=="ACC": + query_list=remove_entry_not_in_fasta_file(query_list,db) + sys.stdout.write("%u query entries in db\n"%len(query_list)) + + if outfmt=="fasta": + if infmt=="ACC": + query_list=remove_entry_in_fasta_file(query_list,argv[1]) + sys.stdout.write("%u query entries not fetched\n"%len(query_list)) + fp=open(argv[1],'a') + else: + fp=open(argv[1],'w') + for i in range(0,len(query_list),split_size): + sys.stdout.write("Retrieving entries %u-%u\n"%( + i+1,min(i+split_size,len(query_list)))) + while (True): # keep trying to retrieve entry until success + try: + page=batch_retrival(query_list[i:i+split_size], + infmt,outfmt,db) + fp.write(page) + fp.flush() + break + except Exception,error: + sys.stderr.write(str(error)+'\n') fp.close() From a389ac62a2c6db9e4b9bd53edc1e2de05e1bd648 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Sat, 21 Jul 2018 11:51:35 -0400 Subject: [PATCH 03/32] contact_pdb can read from stdin --- contact_pdb.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/contact_pdb.py b/contact_pdb.py index 3fcbefb..5fb9027 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -82,7 +82,7 @@ def read_contact_map(infile="contact.map", resi1=[] # residue index 1 list resi2=[] # residue index 2 list p=[] # cscore of contact prediction accuracy list - fp=open(infile,'rU') + fp=open(infile,'rU') if infile!='-' else sys.stdin lines=fp.read().strip().splitlines() fp.close() pattern=re.compile('(^\d+\s+\d+\s+\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+[A-Z]\s+\d+\s+[A-Z]\s+[-+.e\d]+\s+[-+.e\d]+)') @@ -152,7 +152,7 @@ def calc_res_dist(infile="pdb.pdb",atom_sele="CA"): atom_sele - select atoms whose euclidean distances are to be calculated "CA" for alpha carbon "CB" for alpha carbon in gly in beta carbon in all other amino acids''' - fp=open(infile,'rU') + fp=open(infile,'rU') if infile!='-' else sys.stdin struct=fp.read().split("ENDMDL")[0] # first model only fp.close() @@ -332,6 +332,7 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): cutoff_long =0 offset =0 infmt="rr" + file_list=[] for arg in sys.argv[1:]: if arg.startswith("-cutoff="): cutoff=float(arg[len("-cutoff="):]) @@ -355,10 +356,10 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): cutoff_long=float(arg[len("-cutoff_long="):]) elif arg.startswith("-offset="): offset=int(arg[len("-offset="):]) - elif arg.startswith("-"): + elif arg.startswith("-") and len(arg)>1: sys.stderr.write("ERROR! Unknown argument %s\n"%arg) exit() - file_list=[arg for arg in sys.argv[1:] if not arg.startswith("-")] + file_list.append(arg) if not file_list: sys.stderr.write(docstring+"\nERROR! No PDB file") exit() @@ -423,4 +424,4 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): else: - sys.stderr,write(docstring+"ERROR! Too many arguments.\n") + sys.stderr.write(docstring+"ERROR! Too many arguments.\n") From a40029c17855a78add3f772832e39a61510a608b Mon Sep 17 00:00:00 2001 From: zcx Date: Fri, 27 Jul 2018 21:20:07 -0400 Subject: [PATCH 04/32] contact_pdb can read gzip file --- contact_pdb.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/contact_pdb.py b/contact_pdb.py index 5fb9027..d64d2b9 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -64,6 +64,7 @@ ''' import sys,os import re +import gzip # see http://predictioncenter.org/casp12/doc/rr_help.html for contact range # defination. Note that NeBcon/NN-BAYES uses different defination for long @@ -82,7 +83,12 @@ def read_contact_map(infile="contact.map", resi1=[] # residue index 1 list resi2=[] # residue index 2 list p=[] # cscore of contact prediction accuracy list - fp=open(infile,'rU') if infile!='-' else sys.stdin + fp=sys.stdin + if infile!='-': + if infile.endswith(".gz"): + fp=gzip.open(infile,'rU') + else: + fp=open(infile,'rU') lines=fp.read().strip().splitlines() fp.close() pattern=re.compile('(^\d+\s+\d+\s+\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+[A-Z]\s+\d+\s+[A-Z]\s+[-+.e\d]+\s+[-+.e\d]+)') @@ -152,7 +158,12 @@ def calc_res_dist(infile="pdb.pdb",atom_sele="CA"): atom_sele - select atoms whose euclidean distances are to be calculated "CA" for alpha carbon "CB" for alpha carbon in gly in beta carbon in all other amino acids''' - fp=open(infile,'rU') if infile!='-' else sys.stdin + fp=sys.stdin + if infile!='-': + if infile.endswith(".gz"): + fp=gzip.open(infile,'rU') + else: + fp=open(infile,'rU') struct=fp.read().split("ENDMDL")[0] # first model only fp.close() From 001594d11b8ca2d03134cd5287d4a935f4769bae Mon Sep 17 00:00:00 2001 From: zcx Date: Sun, 29 Jul 2018 09:57:01 -0400 Subject: [PATCH 05/32] fix bug in argument parsing and add -outfmt=lnat in contact_pdb --- contact_pdb.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/contact_pdb.py b/contact_pdb.py index d64d2b9..7634d46 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -51,6 +51,9 @@ contact ACC for top L/2, L=protein length) #short1 short2 short5 medm1 medm2 medm5 long1 long2 long5 all1 all2 all5 + "lnat": statistics on accuracy (ACC): (contact ACC for top Lnative, + where Lnative is the number of native contacts) + #short medm long all -cutoff_all=0 # ignore contact prediction p<=0 -cutoff_short=0.5 # ignore short range contact prediction p<=0.5 @@ -248,6 +251,37 @@ def compare_res_contact(res_dist_list,res_pred_list,cutoff=8): cmp_list=zip(resi1,resi2,dist,contact,p) return cmp_list +def calc_lnat_acc_contact(cmp_list,con_num_dict,sep_range=str(short_range_def)): + '''Calculate residue contact accuracy using ouput if "compare_res_contact" + and native contact number diction "con_num_dict" ''' + top_pred=dict() # top short, medm, long, all prediction + + if not sep_range in ["medium","long"]: + top_pred["short"]=[res_pair for res_pair in cmp_list if \ + short_range_def<=abs(res_pair[0]-res_pair[1])1: sys.stderr.write("ERROR! Unknown argument %s\n"%arg) exit() - file_list.append(arg) + else: + file_list.append(arg) if not file_list: sys.stderr.write(docstring+"\nERROR! No PDB file") exit() @@ -388,7 +423,7 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): sys.stdout.write("%d\t%d\t%.1f\n"%(res_pair[0],res_pair[1],res_pair[2])) elif outfmt=="list": sys.stdout.write("%d\t%d\n"%(res_pair[0],res_pair[1])) - if outfmt=="stat": + if outfmt.startswith("stat") or outfmt.startswith("lnat"): con_num_dict=calc_contact_num(res_con_list,L) key_list=["short","medm","long","all","L"] sys.stderr.write('\t'.join(key_list)+'\n') @@ -405,7 +440,7 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): sep_range,offset,infmt) cmp_list=compare_res_contact(res_dist_list,res_pred_list,cutoff) - if not outfmt.startswith("stat"): + if not outfmt.startswith("stat") and not outfmt.startswith("lnat"): for res_pair in cmp_list: #resi1,resi2,dist,contact,p if outfmt.startswith("dist"): sys.stdout.write("%d\t%d\t%.1f\t%s\t%.3f\n"%(res_pair[0], @@ -413,6 +448,20 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): elif outfmt=="list": sys.stdout.write("%d\t%d\t%s\t%.3f\n"%(res_pair[0], res_pair[1],res_pair[3],res_pair[4])) + elif outfmt.startswith("lnat"): + con_num_dict=calc_contact_num(res_con_list,L) + ACC,top_pred=calc_lnat_acc_contact(cmp_list,con_num_dict,sep_range) + if sep_range == "short": + key_list=["short"] + elif sep_range == "medium": + key_list=["medm"] + elif sep_range == "long": + key_list=["long"] + else: + key_list=["short", "medm", "long", "all"] + sys.stderr.write('\t'.join(key_list)+'\n') + sys.stdout.write('\t'.join(['%.3f'%ACC[key] for key in key_list] + )+'\n') else: # outfmt=="stat" ACC,top_pred=calc_acc_contact(cmp_list,L,sep_range) if sep_range == "short": From ce525a34491f70b31b806c8ec0ad2e03ec7d0d62 Mon Sep 17 00:00:00 2001 From: zcx Date: Sun, 29 Jul 2018 11:44:08 -0400 Subject: [PATCH 06/32] add -infmt=pdb for contact_pdb --- contact_pdb.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/contact_pdb.py b/contact_pdb.py index 7634d46..77f1a9c 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -61,9 +61,10 @@ -cutoff_long=0.3 # ignore medium range contact prediction p<=0.3 -offset=0 add "offset" to residue index in predicted contact map - -infmt={rr,gremlin} input format of contact map - rr - CASP RR, NeBcon, or mfDCA format + -infmt={rr,gremlin,pdb} input format of contact map + rr - CASP RR, NeBcon, or mfDCA format gremlin - matrix of confidence score + pdb - pdb coordinate file ''' import sys,os import re @@ -76,6 +77,18 @@ medm_range_def=12 # medm_range_def <= separation < long_range_def long_range_def=24 # long_range_def <= separation. 25 in NeBcon/NN-BAYES +def read_pseudo_contact_map(infile="model1.pdb",atom_sele="CB", + cutoff=8, sep_range=str(short_range_def),offset=0): + res_dist_list=calc_res_dist(infile,atom_sele) + res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff) + resi1,resi2,p=map(list,zip(*res_con_list)) + for i in range(len(res_con_list)): + resi1[i]+=offset + resi2[i]+=offset + p[i]=1-1.*p[i]/cutoff + p,resi1,resi2=map(list,zip(*sorted(zip(p,resi1,resi2),reverse=True))) + return zip(resi1,resi2,p) + def read_contact_map(infile="contact.map", cutoff_all=0,cutoff_short=0,cutoff_medium=0,cutoff_long=0, sep_range=str(short_range_def),offset=0,infmt="rr"): @@ -435,9 +448,13 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): elif len(file_list)==2: # calculate contact prediction accuracy - res_pred_list=read_contact_map(file_list[1], - cutoff_all,cutoff_short,cutoff_medium,cutoff_long, - sep_range,offset,infmt) + if infmt!="pdb": + res_pred_list=read_contact_map(file_list[1], + cutoff_all,cutoff_short,cutoff_medium,cutoff_long, + sep_range,offset,infmt) + else: + res_pred_list=read_pseudo_contact_map(file_list[1],atom_sele, + cutoff, sep_range, offset) cmp_list=compare_res_contact(res_dist_list,res_pred_list,cutoff) if not outfmt.startswith("stat") and not outfmt.startswith("lnat"): From 6245200f3b6f52df846ad7df6ff6e0d09e585c4b Mon Sep 17 00:00:00 2001 From: chengxin Date: Mon, 26 Nov 2018 16:24:40 -0500 Subject: [PATCH 07/32] fix undefined GO term issue in create_UNIPROT_GOterms.py --- create_UNIPROT_GOterms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/create_UNIPROT_GOterms.py b/create_UNIPROT_GOterms.py index bc0965a..0db27d4 100755 --- a/create_UNIPROT_GOterms.py +++ b/create_UNIPROT_GOterms.py @@ -270,6 +270,8 @@ def create_UNIPROT_GOterms(uniprot_list=[],GOA_dict=dict(), GOterms_Aspect[Aspect]=GOA_dict[Aspect][p] if obo_dict: # trace back all parent nodes for Term_id in set(GOterms_Aspect[Aspect]): + if not Term_id in obo_dict[Aspect]["Term"]: + continue GOterms_Aspect[Aspect]+=obo_dict.is_a(Term_id, direct=False,name=False,number=False).split() GOterms_Aspect[Aspect]=sorted(set(GOterms_Aspect[Aspect])-excludeGO) From cbaeff5978ca97c7ef30128bb9b45663e53de87a Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Wed, 20 Feb 2019 20:00:30 -0500 Subject: [PATCH 08/32] fix empty contact map bug --- contact_pdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contact_pdb.py b/contact_pdb.py index 77f1a9c..b38d6a2 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -81,6 +81,8 @@ def read_pseudo_contact_map(infile="model1.pdb",atom_sele="CB", cutoff=8, sep_range=str(short_range_def),offset=0): res_dist_list=calc_res_dist(infile,atom_sele) res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff) + if len(res_con_list)==0: + return zip([],[],[]) resi1,resi2,p=map(list,zip(*res_con_list)) for i in range(len(res_con_list)): resi1[i]+=offset From 49b86853c2b55b4e42b86aa46f0ca77e53762eb4 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Wed, 20 Feb 2019 20:00:56 -0500 Subject: [PATCH 09/32] fix bug in negative residue index --- reindex_pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reindex_pdb.py b/reindex_pdb.py index 2c255e4..a0b34e1 100755 --- a/reindex_pdb.py +++ b/reindex_pdb.py @@ -154,7 +154,7 @@ def reindex_pdb(startindex,infile,clean=True): for arg in sys.argv[1:]: if arg.startswith("-clean="): clean=(arg[len("-clean="):].lower()=="true") - elif arg.startswith('-'): + elif arg.startswith('-') and len(set(arg[1:])-set(map(str,range(10)))): sys.stderr.write("ERROR! Unknown option %s\n"%arg) exit() else: From a836b0708a80b665747d906d6ad3b4e4fd4f4374 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Wed, 20 Feb 2019 20:01:24 -0500 Subject: [PATCH 10/32] suport nucleic acid --- fixMSE.py | 2 ++ pdb2fasta.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fixMSE.py b/fixMSE.py index 6bcbeb4..9d6c1e8 100755 --- a/fixMSE.py +++ b/fixMSE.py @@ -45,6 +45,8 @@ 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G', 'SMC':'C', 'CGU':'E', 'CSX':'C', 'SEC':'U', 'PYL':'O', 'ASX':'B', 'GLX':'Z', 'LLP':'X', 'UNK':'X', + ' DA':'a', ' DC':'c', ' DG':'g', ' DT':'t', ' DU':'u', + ' A':'a', ' C':'c', ' G':'g', ' T':'t', ' U':'u', } aa1to3 = { # 3 letter to 1 letter amino acid code conversion diff --git a/pdb2fasta.py b/pdb2fasta.py index 63f4d38..37f12db 100755 --- a/pdb2fasta.py +++ b/pdb2fasta.py @@ -220,7 +220,7 @@ def pdbtxt2seq(txt='',infile='pdb.pdb',PERMISSIVE="MSE",outfmt="PDB", chain_dict[chain_id]+=tmp_seq continue - if line[13:15]!="CA": # Carbon Alpha Only + if line[12:16]!=" CA " and line[12:16]!=" C3'": # Carbon Alpha Only continue if not line[16] in [' ','A']: # remove alternative location continue From 44206f9664e932eba4e92fbe98c51e151f78db43 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Thu, 21 Feb 2019 13:16:55 -0500 Subject: [PATCH 11/32] choose different molecule type --- pdb2fasta.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/pdb2fasta.py b/pdb2fasta.py index 37f12db..8f7c4eb 100755 --- a/pdb2fasta.py +++ b/pdb2fasta.py @@ -25,6 +25,12 @@ true - convert from "SEQRES" entries if present, otherwise convert from "ATOM" entries. (For PDB format input only) + +-mol={all,protein,rna,dna} which macromolecule type to use + all - use " CA " or " C3'" atoms + protein - use " CA " atoms + rna - use " C3'" atoms + dna - use " C3'" atoms, the same as rna ''' import sys,os import shutil @@ -39,10 +45,12 @@ 'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H', 'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G', #'MSE':'M', + ' DA':'a', ' DC':'c', ' DG':'g', ' DT':'t', + ' A':'a', ' C':'c', ' G':'g', ' T':'u', } def pdbbundle2seq(tarball_name="pdb-bundle.tar.gz",PERMISSIVE="MSE", - outfmt="PDB",allowX=True): + outfmt="PDB",allowX=True,mol="all"): '''convert best effort/minimum PDB bundle to sequence ''' chain_id_mapping=dict() @@ -76,7 +84,7 @@ def pdbbundle2seq(tarball_name="pdb-bundle.tar.gz",PERMISSIVE="MSE", txt=fp.read() fp.close() header_list_tmp,sequence_list_tmp=pdbtxt2seq( - txt,pdb_bundle_name,PERMISSIVE,outfmt,allowX) + txt,pdb_bundle_name,PERMISSIVE,outfmt,allowX,mol) if outfmt=="PDB": header_list+=[chain_id_mapping[h] for h \ in header_list_tmp] @@ -90,7 +98,7 @@ def pdbbundle2seq(tarball_name="pdb-bundle.tar.gz",PERMISSIVE="MSE", return header_list,sequence_list def pdb2seq(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB", - allowX=True, SEQRES=False): + allowX=True, SEQRES=False,mol="all"): '''Convert PDB to sequence. Return two lists, one for headers and the other for sequence. @@ -100,16 +108,16 @@ def pdb2seq(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB", MSE: (default) Disallow any non-standard amino acid apart from MSE ''' if infile.endswith(".tar.gz"): # best effort/minimum PDB bundle - return pdbbundle2seq(infile,PERMISSIVE,outfmt,allowX) - elif infile.endswith(".cif") or infile.endswith(".cif.gz"): # PDBx/mmCIF - return mmCIF2seq(infile,PERMISSIVE,outfmt,allowX) + return pdbbundle2seq(infile,PERMISSIVE,outfmt,allowX,mol) + #elif infile.endswith(".cif") or infile.endswith(".cif.gz"): # PDBx/mmCIF + #return mmCIF2seq(infile,PERMISSIVE,outfmt,allowX) elif infile.endswith(".gz"): fp=gzip.open(infile,'rU') else: fp=open(infile,'rU') txt=fp.read() fp.close() - return pdbtxt2seq(txt,infile,PERMISSIVE,outfmt,allowX,SEQRES) + return pdbtxt2seq(txt,infile,PERMISSIVE,outfmt,allowX,SEQRES,mol) def mmCIF2seq(infile="pdb.cif", PERMISSIVE="MSE",outfmt="PDB", allowX=True): @@ -182,7 +190,7 @@ def mmCIF2seq(infile="pdb.cif", PERMISSIVE="MSE",outfmt="PDB", return header_list,sequence_list def pdbtxt2seq(txt='',infile='pdb.pdb',PERMISSIVE="MSE",outfmt="PDB", - allowX=True,SEQRES=False): + allowX=True,SEQRES=False,mol="all"): '''Convert PDB text "txt" to sequence read from PDB file "infile" Return two lists, one for headers and the other for sequence. @@ -194,6 +202,7 @@ def pdbtxt2seq(txt='',infile='pdb.pdb',PERMISSIVE="MSE",outfmt="PDB", txt=txt.split("\nENDMDL")[0] # Only the first model if not "SEQRES" in txt: SEQRES=False # use "ATOM" is "SEQRES" is absent + mol=mol.lower() aa3to1=code_with_modified_residues @@ -220,7 +229,9 @@ def pdbtxt2seq(txt='',infile='pdb.pdb',PERMISSIVE="MSE",outfmt="PDB", chain_dict[chain_id]+=tmp_seq continue - if line[12:16]!=" CA " and line[12:16]!=" C3'": # Carbon Alpha Only + if (mol=="protein" and line[12:16]!=" CA ") or \ + (mol in {"rna","dna"} and line[12:16]!=" C3'") or \ + (mol=="all" and line[12:16]!=" CA " and line[12:16]!=" C3'"): continue if not line[16] in [' ','A']: # remove alternative location continue @@ -271,9 +282,9 @@ def pdbtxt2seq(txt='',infile='pdb.pdb',PERMISSIVE="MSE",outfmt="PDB", def pdb2fasta(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB", - allowX=True,SEQRES=False): + allowX=True,SEQRES=False, mol="all"): '''Convert PDB to FASTA''' - header_list,sequence_list=pdb2seq(infile,PERMISSIVE,outfmt,allowX,SEQRES) + header_list,sequence_list=pdb2seq(infile,PERMISSIVE,outfmt,allowX,SEQRES,mol) fasta_list=['>'+header_list[i]+'\n'+ \ sequence_list[i] for i in range(len(sequence_list))] return '\n'.join(fasta_list)+'\n' @@ -283,6 +294,7 @@ def pdb2fasta(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB", outfmt="PDB" allowX=True SEQRES=False + mol="all" argv=[] for arg in sys.argv[1:]: if arg.startswith("-outfmt="): @@ -293,6 +305,8 @@ def pdb2fasta(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB", allowX=(arg[len("-allowX="):].lower()=="true") elif arg.startswith("-SEQRES="): SEQRES=(arg[len("-SEQRES="):].lower()=="true") + elif arg.startswith("-mol="): + mol=arg[len("-mol="):].lower() elif arg.startswith("-"): sys.stderr.write("ERROR! Unknown argument %s\n"%arg) exit() @@ -305,4 +319,4 @@ def pdb2fasta(infile="pdb.pdb", PERMISSIVE="MSE", outfmt="PDB", for pdb in argv: sys.stdout.write(pdb2fasta( pdb, PERMISSIVE=PERMISSIVE, outfmt=outfmt, allowX=allowX, - SEQRES=SEQRES)) + SEQRES=SEQRES, mol=mol)) From 4f50518434c1d1d25ad38290c313435b777419a6 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Thu, 21 Feb 2019 13:34:54 -0500 Subject: [PATCH 12/32] fix pdbbundle molecule type --- pdb2fasta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdb2fasta.py b/pdb2fasta.py index 8f7c4eb..ef5708a 100755 --- a/pdb2fasta.py +++ b/pdb2fasta.py @@ -84,7 +84,7 @@ def pdbbundle2seq(tarball_name="pdb-bundle.tar.gz",PERMISSIVE="MSE", txt=fp.read() fp.close() header_list_tmp,sequence_list_tmp=pdbtxt2seq( - txt,pdb_bundle_name,PERMISSIVE,outfmt,allowX,mol) + txt,pdb_bundle_name,PERMISSIVE,outfmt,allowX,False,mol) if outfmt=="PDB": header_list+=[chain_id_mapping[h] for h \ in header_list_tmp] From 25165fe97651974f0dd2a46f96b848be9a6c7a71 Mon Sep 17 00:00:00 2001 From: Zhang Date: Mon, 11 Mar 2019 16:25:33 -0400 Subject: [PATCH 13/32] distance_pdb --- distance_pdb.py | 276 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100755 distance_pdb.py diff --git a/distance_pdb.py b/distance_pdb.py new file mode 100755 index 0000000..f25cac0 --- /dev/null +++ b/distance_pdb.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python +# 2016-01-28 Chengxin Zhang +docstring=''' +distance_pdb.py [options] pdb.pdb + Calculate residue distance bins in single chain PDB file "pdb.pdb" + +Options: + -atom={CA,CB} calculate distance between "CA" for all residues, or "CA" for + gly and "CB" for other 19 amino acids + + -outfmt={stat,list,dist} output format: + "list": tab-eliminated list listing residue index for contact pairs + "dist": tab-eliminated list listing residue distances for all pairs + "stat": statistics on number of contacts at short/medm/long/all + range and protein length L + + -range={all,short,medium,long} sequences seperation range x + "all": 1<=x + "short": 6<=x<12 + "medium": 12<=x<24 + "long": 24<=x (most useful) + (default): 6<=x + + +distance_pdb.py [options] pdb.pdb distance.map + Calculate accuracy of sorted residue distance map "distance.map" + according to single chain PDB file "pdb.pdb" + + "distance.map" should be of ResTriplet2 format + +Options: + -cutoff=8 + + -atom={CB,CA} atom with which contact is considered. + CB - CA for GLY, CB for other 19 AA + CA - CA for all 20 AA + + -range={all,short,medium,long} + + -offset=0 add "offset" to residue index in predicted contact map + -infmt={rr,gremlin,pdb} input format of contact map + res - ResTriplet2 + pdb - pdb coordinate file +''' +import sys,os +import re +import gzip + +from contact_pdb import calc_res_dist,calc_res_contact,calc_contact_num +#import numpy as np +#bins=np.array([0]+list(np.arange(2,22,20./64))+[22]) +bins=[0]+[20./64*i+2 for i in range(64)]+[22] + +# see http://predictioncenter.org/casp12/doc/rr_help.html for contact range +# defination. Note that NeBcon/NN-BAYES uses different defination for long +# range contact +short_range_def=6 # short_range_def <= separation < medm_range_def +medm_range_def=12 # medm_range_def <= separation < long_range_def +long_range_def=24 # long_range_def <= separation. 25 in NeBcon/NN-BAYES + +def read_pseudo_distance_map(infile="model1.pdb",atom_sele="CB", + sep_range=str(short_range_def),offset=0): + res_dist_list=calc_res_dist(infile,atom_sele) + res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff=bins[-1]) + resi1,resi2,p=zip(*res_con_list) + p,resi1,resi2=zip(*sorted(zip(p,resi1,resi2))) + res_con_list=zip(resi1,resi2,p) + return calc_res_bin(res_con_list) + +def read_distance_map(infile="dist.distri",sep_range=str(short_range_def),offset=0): + '''Read NN-BAYES or CASP RR format contact map. return them in a zipped + list with 3 fields for each residue pair. 1st field & 2nd filed are for + residue indices, and 3rd field is for euclidean distance. + ''' + resi1=[] # residue index 1 list + resi2=[] # residue index 2 list + p=[] # cscore of contact prediction accuracy list + fp=sys.stdin + if infile!='-': + if infile.endswith(".gz"): + fp=gzip.open(infile,'rU') + else: + fp=open(infile,'rU') + lines=fp.read().strip().splitlines() + fp.close() + pattern=re.compile('(^\d+\s+\d+\s+\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+[A-Z]\s+\d+\s+[A-Z]\s+[-+.e\d]+\s+[-+.e\d]+)') + if infmt!="rr": + for i,line in enumerate(lines): + for j,cscore in enumerate(line.split()): + if (j<=i): + continue + seperation=abs(i-j) + if (sep_range=="short" and not \ + short_range_def<=seperation1: + sys.stderr.write("ERROR! Unknown argument %s\n"%arg) + exit() + else: + file_list.append(arg) + if not file_list: + sys.stderr.write(docstring+"\nERROR! No PDB file") + exit() + + res_dist_list=calc_res_dist(file_list[0],atom_sele) + res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff=bins[-1]) + #res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff=0) + res_bin_list=calc_res_bin(res_con_list) + + L=map(list,zip(*res_dist_list)) + L=L[0]+L[1] + L=max(L)-min(L)+1 + + if len(file_list)==1: # calculate residue contact + if outfmt=="dist": + for res_pair in res_con_list: + sys.stdout.write("%d\t%d\t%.1f\n"%(res_pair[0],res_pair[1],res_pair[2])) + elif outfmt=="list": + for res_pair in res_bin_list: + sys.stdout.write("%d\t%d\t%d\n"%(res_pair[0],res_pair[1],res_pair[2])) + elif outfmt.startswith("stat") or outfmt.startswith("lnat"): + con_num_dict=calc_contact_num(res_con_list,L) + key_list=["short","medm","long","all","L"] + sys.stderr.write('\t'.join(key_list)+'\n') + sys.stdout.write('\t'.join([str(con_num_dict[key] + ) for key in key_list])+'\n') + + + elif len(file_list)==2: # calculate distance prediction accuracy + if infmt!="pdb": + res_pred_list=read_distance_map(file_list[1], sep_range,offset) + else: + res_pred_list=read_pseudo_distance_map(file_list[1], + atom_sele, sep_range, offset) + cmp_list=compare_res_contact(res_bin_list,res_pred_list) + + con_num_dict=calc_contact_num(res_con_list,L) + ACC,top_pred=calc_lnat_acc_distance(cmp_list,con_num_dict,sep_range) + if sep_range == "short": + key_list=["short"] + elif sep_range == "medium": + key_list=["medm"] + elif sep_range == "long": + key_list=["long"] + else: + key_list=["short", "medm", "long", "all"] + sys.stderr.write('\t'.join(key_list)+'\n') + sys.stdout.write('\t'.join(['%.3f'%ACC[key] for key in key_list] + )+'\n') + else: + sys.stderr.write(docstring+"ERROR! Too many arguments.\n") From dab1866676d28dddb1c4f748d3b843f80d3dc863 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Mon, 11 Mar 2019 20:14:05 -0400 Subject: [PATCH 14/32] read real distance map --- distance_pdb.py | 62 ++++++++++++++++--------------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/distance_pdb.py b/distance_pdb.py index f25cac0..5f344f0 100755 --- a/distance_pdb.py +++ b/distance_pdb.py @@ -38,7 +38,7 @@ -range={all,short,medium,long} -offset=0 add "offset" to residue index in predicted contact map - -infmt={rr,gremlin,pdb} input format of contact map + -infmt={res,pdb} input format of contact map res - ResTriplet2 pdb - pdb coordinate file ''' @@ -67,14 +67,15 @@ def read_pseudo_distance_map(infile="model1.pdb",atom_sele="CB", res_con_list=zip(resi1,resi2,p) return calc_res_bin(res_con_list) -def read_distance_map(infile="dist.distri",sep_range=str(short_range_def),offset=0): +def read_distance_map(infile="dist.distri",sep_range=str(short_range_def), + offset=0): '''Read NN-BAYES or CASP RR format contact map. return them in a zipped list with 3 fields for each residue pair. 1st field & 2nd filed are for residue indices, and 3rd field is for euclidean distance. ''' resi1=[] # residue index 1 list resi2=[] # residue index 2 list - p=[] # cscore of contact prediction accuracy list + p=[] # distance bin of each residue pair fp=sys.stdin if infile!='-': if infile.endswith(".gz"): @@ -83,42 +84,17 @@ def read_distance_map(infile="dist.distri",sep_range=str(short_range_def),offset fp=open(infile,'rU') lines=fp.read().strip().splitlines() fp.close() - pattern=re.compile('(^\d+\s+\d+\s+\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+\d+\s+[-+.e\d]+)|(^\d+\s+[A-Z]\s+\d+\s+[A-Z]\s+[-+.e\d]+\s+[-+.e\d]+)') - if infmt!="rr": - for i,line in enumerate(lines): - for j,cscore in enumerate(line.split()): - if (j<=i): - continue - seperation=abs(i-j) - if (sep_range=="short" and not \ - short_range_def<=seperationmax_prob: + p[-1]=int(b) + max_prob=data[b] return zip(resi1,resi2,p) def compare_res_contact(res_bin_list,res_pred_list): @@ -150,7 +126,10 @@ def compare_res_contact(res_bin_list,res_pred_list): distance prediction is correct.''' cmp_list=[] for i,j,b in res_pred_list: - cmp_list.append((i,j, (i,j,b) in res_bin_list )) + cmp_list.append((i,j, 1.*((i,j,b) in res_bin_list)+ \ + .75*(((i,j,b+1) in res_bin_list) or ((i,j,b-1) in res_bin_list))+ \ + .50*(((i,j,b+2) in res_bin_list) or ((i,j,b-2) in res_bin_list))+ \ + .25*(((i,j,b+3) in res_bin_list) or ((i,j,b-3) in res_bin_list)))) return cmp_list def calc_lnat_acc_distance(cmp_list,con_num_dict,sep_range=str(short_range_def)): @@ -229,7 +208,6 @@ def calc_res_bin(res_con_list): res_dist_list=calc_res_dist(file_list[0],atom_sele) res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff=bins[-1]) - #res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff=0) res_bin_list=calc_res_bin(res_con_list) L=map(list,zip(*res_dist_list)) From e6263e25dc1d432e938000728bc435905d93400e Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Tue, 12 Mar 2019 21:56:56 -0400 Subject: [PATCH 15/32] uniprot api --- batch_retrieval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/batch_retrieval.py b/batch_retrieval.py index e878893..c4ac2cd 100755 --- a/batch_retrieval.py +++ b/batch_retrieval.py @@ -75,7 +75,8 @@ def batch_retrival(query_list=[],infmt="ACC",outfmt="fasta",db=''): } data = urllib.urlencode(params) - request = urllib2.Request(url_mapping, data) + #request = urllib2.Request(url_mapping, data) + request = urllib2.Request(url_upload, data) request.add_header('User-Agent', 'Python %s' % email) response = urllib2.urlopen(request) page = response.read() From 00377fa018eefff86ea033eac5c849861c333637 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Tue, 12 Mar 2019 21:57:28 -0400 Subject: [PATCH 16/32] fix bug in reading single decoy rep*tra* file --- reptra2pdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reptra2pdb.py b/reptra2pdb.py index da3c751..7f38f52 100755 --- a/reptra2pdb.py +++ b/reptra2pdb.py @@ -53,6 +53,8 @@ def split_reptra(infile="rep1.tra1.bz2"): pattern=re.compile("\n\s*\d+\s+[-]{0,1}[.\d]+\s+\d+\s+\d+\n") # index for start sections of each decoy (except the first section) match_index=[e.start()+1 for e in pattern.finditer(txt)] + if not match_index: + return [txt] decoy_txt_list=[txt[:match_index[0]]] + \ [txt[match_index[idx]:match_index[idx+1]] \ From fc8c31557b77847a7f0f6469924516e54161fc81 Mon Sep 17 00:00:00 2001 From: zcx Date: Mon, 15 Apr 2019 16:56:10 -0400 Subject: [PATCH 17/32] xvg2png for gromacs --- readme.zcx | 1 + xvg2png.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100755 xvg2png.py diff --git a/readme.zcx b/readme.zcx index 6ffd531..b59c601 100644 --- a/readme.zcx +++ b/readme.zcx @@ -28,6 +28,7 @@ strip_sidechain.py # remove side DomainParser.py # partition a PDB chain into domains split_uniprot.py # partition a PDB file into different uniprot ID pdb2fasta.py # convert PDB file to fasta file +xvg2png.py # plot gromacs xvf file to png image #### GO parsing #### obo2csv.py # parsing obo format GO defination file diff --git a/xvg2png.py b/xvg2png.py new file mode 100755 index 0000000..7295e29 --- /dev/null +++ b/xvg2png.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python +docstring=''' +xvg2img.py gromacs.xvg gromacs.png + convert gromacs format image gromacs.xvg to png format image gromacs.png +''' +import matplotlib +matplotlib.use("agg") +from matplotlib import pyplot as plt +import numpy as np +import sys +import re + +title_pat=re.compile('@\s+title\s+\"([\s\S]+?)\"\n') +yaxis_pat=re.compile('@\s+yaxis\s+label\s+\"([\s\S]+?)\"\n') +xaxis_pat=re.compile('@\s+xaxis\s+label\s+\"([\s\S]+?)\"\n') +legend_pat=re.compile('@\s+\S+\s+legend\s+\"([\s\S]+?)\"\n') + +def xvg2img(infile,outfile): + data=np.loadtxt(infile,comments=["#","@"]) + fp=open(infile,'rU') + txt=fp.read() + fp.close() + + title_list =title_pat.findall(txt) + yaxis_list =yaxis_pat.findall(txt) + xaxis_list =xaxis_pat.findall(txt) + legend_list=legend_pat.findall(txt) + + plt.figure() + for i in range(1,data.shape[1]): + if len(legend_list)>=i: + plt.plot(data[:,0],data[:,i],label=legend_list[i-1]) + else: + plt.plot(data[:,0],data[:,i]) + + if title_list: + plt.title(title_list[0]) + if yaxis_list: + plt.ylabel(yaxis_list[0]) + if xaxis_list: + plt.xlabel(xaxis_list[0]) + if len(legend_list)>1: + plt.legend(loc="best") + plt.savefig(outfile,dpi=300) + return + +if __name__=="__main__": + if len(sys.argv)<=1: + sys.stderr.write(docstring) + exit() + xvg2img(sys.argv[1],sys.argv[2]) From 536ad9b749614b27bbf1d3128fc4ff67ab10a15c Mon Sep 17 00:00:00 2001 From: kad-ecoli Date: Sat, 20 Apr 2019 11:04:13 -0400 Subject: [PATCH 18/32] tmap format --- contact_pdb.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/contact_pdb.py b/contact_pdb.py index b38d6a2..5eb851b 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -15,6 +15,7 @@ "dist": tab-eliminated list listing residue distances for all pairs "stat": statistics on number of contacts at short/medm/long/all range and protein length L + "npy": matrix for CNN training -range={all,short,medium,long} sequences seperation range x "all": 1<=x @@ -424,6 +425,9 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): if not file_list: sys.stderr.write(docstring+"\nERROR! No PDB file") exit() + + if outfmt=="npy": + sep_range="all" res_dist_list=calc_res_dist(file_list[0],atom_sele) res_con_list=calc_res_contact(res_dist_list,sep_range,cutoff) @@ -433,17 +437,25 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): L=max(L)-min(L)+1 if len(file_list)==1: # calculate residue contact + if outfmt=="npy": + import numpy as np + tmap=np.zeros((L,L),dtype=np.float32 + )+np.diag(np.ones(L,dtype=np.float32)) for res_pair in res_con_list: if outfmt.startswith("dist"): sys.stdout.write("%d\t%d\t%.1f\n"%(res_pair[0],res_pair[1],res_pair[2])) elif outfmt=="list": sys.stdout.write("%d\t%d\n"%(res_pair[0],res_pair[1])) + elif outfmt=="npy": + tmap[res_pair[0]-1][res_pair[1]-1]=1 if outfmt.startswith("stat") or outfmt.startswith("lnat"): con_num_dict=calc_contact_num(res_con_list,L) key_list=["short","medm","long","all","L"] sys.stderr.write('\t'.join(key_list)+'\n') sys.stdout.write('\t'.join([str(con_num_dict[key] ) for key in key_list])+'\n') + elif outfmt=="npy": + np.save(sys.stdout,tmap.reshape(L*L)) if not cutoff and outfmt=="list": sys.stderr.write("\nWARNING! cutoff not set\n\n") From 13ff97cb8433c80dbf6f616fc19db19c80ea8929 Mon Sep 17 00:00:00 2001 From: kad-ecoli Date: Sat, 20 Apr 2019 11:08:49 -0400 Subject: [PATCH 19/32] fix symmetry in tmap --- contact_pdb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contact_pdb.py b/contact_pdb.py index 5eb851b..60b19bf 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -447,7 +447,8 @@ def calc_res_contact(res_dist_list,sep_range=str(short_range_def),cutoff=8): elif outfmt=="list": sys.stdout.write("%d\t%d\n"%(res_pair[0],res_pair[1])) elif outfmt=="npy": - tmap[res_pair[0]-1][res_pair[1]-1]=1 + tmap[res_pair[0]-1][res_pair[1]-1]= \ + tmap[res_pair[1]-1][res_pair[0]-1]=1 if outfmt.startswith("stat") or outfmt.startswith("lnat"): con_num_dict=calc_contact_num(res_con_list,L) key_list=["short","medm","long","all","L"] From b8d08607a00b90c678681c488a63d8a178b42670 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Fri, 3 May 2019 11:58:21 -0400 Subject: [PATCH 20/32] fix raise error --- obo2csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/obo2csv.py b/obo2csv.py index c473845..4f19385 100755 --- a/obo2csv.py +++ b/obo2csv.py @@ -255,7 +255,7 @@ def Term(self,Term_id=""): return [v for k,v in self[Aspect]["Term"].items()] sys.stderr.write("ERROR! Cannot find GO Term %s\n"%Term_id) - raise + return [] def has_a(self,Term_id=""): '''cumulatively list all child GO id''' From 3071d1fb01bf1320ca75411f53d0f8ba6e625a50 Mon Sep 17 00:00:00 2001 From: zcx Date: Sun, 28 Jul 2019 11:09:09 -0400 Subject: [PATCH 21/32] fasta2pfam can read from stdin --- fasta2pfam.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fasta2pfam.py b/fasta2pfam.py index df8af8c..d0df1b0 100755 --- a/fasta2pfam.py +++ b/fasta2pfam.py @@ -36,7 +36,10 @@ def fasta2pfam(fasta_txt="seq.fasta",seqnamelen=0): exit() for arg in argv: - fp=open(arg,'rU') - fasta_txt=fp.read() - fp.close() + if arg=='-': + fasta_txt=sys.stdin.read() + else: + fp=open(arg,'rU') + fasta_txt=fp.read() + fp.close() sys.stdout.write(fasta2pfam(fasta_txt)) From 63a6aee3c891a3736c6bc0142ee6eca4a648c38b Mon Sep 17 00:00:00 2001 From: zcx Date: Sun, 28 Jul 2019 11:09:37 -0400 Subject: [PATCH 22/32] distance prediction evaluation in contact_pdb.py --- contact_pdb.py | 155 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 135 insertions(+), 20 deletions(-) diff --git a/contact_pdb.py b/contact_pdb.py index 60b19bf..ef3f476 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -5,7 +5,8 @@ Calculate residue contacts in single chain PDB file "pdb.pdb" Options: - -cutoff=8 distance cutoff (in Angstrom) for contact, usu between 6 and 12 + -cutoff=8 distance cutoff (in Angstrom) for contact + default value is 22 for -infmt=dist and 8 otherwise -atom={CA,CB} calculate distance between "CA" for all residues, or "CA" for gly and "CB" for other 19 amino acids @@ -55,6 +56,7 @@ "lnat": statistics on accuracy (ACC): (contact ACC for top Lnative, where Lnative is the number of native contacts) #short medm long all + default value is "lnat" if -infmt=dist, and "stat" otherwise -cutoff_all=0 # ignore contact prediction p<=0 -cutoff_short=0.5 # ignore short range contact prediction p<=0.5 @@ -62,10 +64,11 @@ -cutoff_long=0.3 # ignore medium range contact prediction p<=0.3 -offset=0 add "offset" to residue index in predicted contact map - -infmt={rr,gremlin,pdb} input format of contact map + -infmt={rr,gremlin,pdb,dist} input format of contact map rr - CASP RR, NeBcon, or mfDCA format gremlin - matrix of confidence score pdb - pdb coordinate file + dist - fitted distance prediction by ResTriplet2 ''' import sys,os import re @@ -92,6 +95,22 @@ def read_pseudo_contact_map(infile="model1.pdb",atom_sele="CB", p,resi1,resi2=map(list,zip(*sorted(zip(p,resi1,resi2),reverse=True))) return zip(resi1,resi2,p) +def read_distance_map(infile="model1.pdb",offset=0): + fp=open(infile,'rU') + lines=fp.read().splitlines() + fp.close() + resi1=[] + resi2=[] + mu_list=[] + sigma_list=[] + for line in lines: + i,j,mu,sigma=line.split()[:4] + resi1.append(int(i)+offset) + resi2.append(int(j)+offset) + mu_list.append(float(mu)) + sigma_list.append(float(sigma)) + return zip(resi1,resi2,mu_list,sigma_list) + def read_contact_map(infile="contact.map", cutoff_all=0,cutoff_short=0,cutoff_medium=0,cutoff_long=0, sep_range=str(short_range_def),offset=0,infmt="rr"): @@ -267,6 +286,70 @@ def compare_res_contact(res_dist_list,res_pred_list,cutoff=8): cmp_list=zip(resi1,resi2,dist,contact,p) return cmp_list + +def compare_res_dist(res_dist_list,dist_pred_list,cutoff=8): + '''compare residue distance map "res_dist_list" calculate from pdb to + predicted residue contact "dist_pred_list. return the result in a zipped + list with 6 fields for each pair. 1st field & 2nd filed are for residue + indices, 3rd field is for euclidean distance, 4th field for distance + prediction. 5th field for absolute difference in predicted and native + distance. 6th field for predicted deviation. + ''' + res_dist_dict=dict() # key is residue pair, value is distance + for i,j,dist in res_dist_list: + res_dist_dict[(i,j)]=dist + res_pred_dict=dict() # key is residue pair, value is cscore + for i,j,mu,sigma in dist_pred_list: + res_pred_dict[(i,j)]=(sigma,mu) + + cmp_list=[] + for i,j in set(res_dist_dict.keys()).intersection(res_pred_dict.keys()): + dist=res_dist_dict[(i,j)] + sigma,mu=res_pred_dict[(i,j)] + cmp_list.append((sigma,i,j,dist,mu,abs(dist-mu))) + + #if len(cmp_list)==0: + #return [] + + # sort on sigma + sigma_list,resi1_list,resi2_list,dist_list,mu_list,err_list=map( + list,zip(*sorted(cmp_list,reverse=False))) + return zip(resi1_list,resi2_list,dist_list,mu_list,err_list,sigma_list) + +def calc_lnat_acc_dist(cmp_list,con_num_dict,sep_range=str(short_range_def)): + '''Calculate residue contact accuracy using ouput if "compare_res_contact" + and native contact number diction "con_num_dict" ''' + top_pred=dict() # top short, medm, long, all prediction + + if not sep_range in ["medium","long"]: + top_pred["short"]=[res_pair for res_pair in cmp_list if \ + short_range_def<=abs(res_pair[0]-res_pair[1]) Date: Sun, 28 Jul 2019 13:37:46 -0400 Subject: [PATCH 23/32] correct drmsd calculation --- contact_pdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contact_pdb.py b/contact_pdb.py index ef3f476..2016078 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -344,8 +344,8 @@ def calc_lnat_acc_dist(cmp_list,con_num_dict,sep_range=str(short_range_def)): for key in top_pred: ACC[key]=0 if top_pred[key]: - ACC[key]=((sum([e[4]*e[4] for e in top_pred[key]]))**.5 - )/len(top_pred[key]) + ACC[key]=(sum([e[4]*e[4] for e in top_pred[key]] + )/len(top_pred[key]))**.5 coef_sigma[key]=sum([e[4]/e[5] for e in top_pred[key]] )/len(top_pred[key]) return ACC,coef_sigma,top_pred From e3d0e2b85a6a9dbe698f65119033abbf15bb9e38 Mon Sep 17 00:00:00 2001 From: zcx Date: Thu, 19 Sep 2019 16:24:46 -0400 Subject: [PATCH 24/32] gallary view of many pdb files --- batch_pdb_image.py | 83 ++++++++++++++++++++++++++++++++++++++++++++++ batch_retrieval.py | 2 +- readme.zcx | 1 + 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100755 batch_pdb_image.py diff --git a/batch_pdb_image.py b/batch_pdb_image.py new file mode 100755 index 0000000..319910e --- /dev/null +++ b/batch_pdb_image.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +docstring=''' +batch_pdb_image.py list output + Use pymol to generate gallery view of pdb files listed by "list", + and output png format image to "output.*.png" +''' +import sys, os, subprocess +import matplotlib +matplotlib.use("agg") +from matplotlib import pyplot as plt +from string import Template + +pymol_template=Template(';'.join([ + 'pymol -c -q -d "load $infile, infile', + 'viewport 640, 640', + 'zoom', + #'center infile', + #'zoom center, 20', + 'bg_color white', + 'hide all', + 'show cartoon', + 'spectrum', + 'set ray_shadow, 0', + 'set ray_opaque_background, off', + 'ray 640, 640', # by default, this is 640x480 + 'png $outfile"'])) +res_count_template=Template('grep " CA " $infile|grep "^ATOM"|wc -l') + +def batch_pdb_image(infile_list,outfile): + outfile_list=[] + plt_count=0 + ncol=4 + nrow=5 + gallery_count=1 + fontsize=10 + plt.figure(gallery_count,figsize=(8.27,11.69)) + for infile in infile_list: + plt_count+=1 + print(infile) + cmd=pymol_template.substitute( + infile=infile, + outfile=outfile+".png", + ) + p=subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) + p.communicate() + cmd=res_count_template.substitute(infile=infile) + p=subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) + Lch=int(p.communicate()[0]) + plt.subplot(nrow,ncol,plt_count) + img=plt.imshow(plt.imread(outfile+".png")) + os.system("rm %s.png"%outfile) + plt.axis("off") + plt.title("(%s) %s (%d)"%( + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[plt_count-1],infile,Lch), + fontsize=fontsize) + if plt_count==ncol*nrow: + plt_count=0 + outfile_list.append("%s.%d.png"%(outfile,gallery_count)) + print(outfile_list[-1]) + plt.tight_layout(pad=0,h_pad=0,w_pad=0) + plt.savefig(outfile_list[-1],dpi=300) + plt.close() + gallery_count+=1 + plt.figure(gallery_count,figsize=(8.27,11.69)) + if plt_count: + outfile_list.append("%s.%d.png"%(outfile,gallery_count)) + plt.tight_layout(pad=0,h_pad=0,w_pad=0) + plt.savefig(outfile_list[-1],dpi=300) + print(outfile_list[-1]) + return outfile_list + +if __name__=="__main__": + if len(sys.argv)!=3: + sys.stderr.write(docstring) + exit() + + fp=open(sys.argv[1],'rU') + infile_list=fp.read().splitlines() + fp.close() + + outfile=sys.argv[2] + + outfile_list=batch_pdb_image(infile_list,outfile) diff --git a/batch_retrieval.py b/batch_retrieval.py index c4ac2cd..b1846ab 100755 --- a/batch_retrieval.py +++ b/batch_retrieval.py @@ -34,7 +34,7 @@ email = "zcx@umich.edu" # uniprot cannot parse very long list. If given a long list, the full query # list will be splitted into small lists of "split_size" entries -split_size=20000 +split_size=10000 from string import Template import requests diff --git a/readme.zcx b/readme.zcx index b59c601..6cc6c77 100644 --- a/readme.zcx +++ b/readme.zcx @@ -29,6 +29,7 @@ DomainParser.py # partition a PDB chain into domains split_uniprot.py # partition a PDB file into different uniprot ID pdb2fasta.py # convert PDB file to fasta file xvg2png.py # plot gromacs xvf file to png image +batch_pdb_image.py # gallery view of many pdb files #### GO parsing #### obo2csv.py # parsing obo format GO defination file From b1ad0d15300615ab93bab3f09ee204a17a16b08f Mon Sep 17 00:00:00 2001 From: zcx Date: Sat, 12 Oct 2019 08:52:51 -0400 Subject: [PATCH 25/32] direct transformation by matrix --- superpose.py | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/superpose.py b/superpose.py index 635d8b3..5013c2c 100755 --- a/superpose.py +++ b/superpose.py @@ -8,8 +8,10 @@ contains coordinates of "model.pdb" after superposition options: - -algo={TMalign,TMscore,MMalign,pymol-super,pymol-cealign,pymol-align} + -algo={TMalign,TMscore,MMalign,pymol-super,pymol-cealign,pymol-align,matrix} Algorithm used for superposition. + "matrix" means input TMalign style matrix. In this case, "native.pdb" + is the matrix file. -execpath=/usr/bin/TMalign Path to executable. default is searching in the same folder of this script and then search in $PATH @@ -393,10 +395,118 @@ def sup_TMalign(model_pdb,native_pdb, algo="TMalign", return (pdb_txt,[float(RMSD)],[float(TMscore)]) +def sup_matrix(model_pdb,matrix_file, offset=0): + '''Superpose "model_pdb" to "native_pdb" using TMalign/TMscore/MMalign + + model_pdb - moved pdb structure + native_pdb - fixed pdb structure + algo - algorithm for supersition {"TMalign","TMscore","MMalign"} + execpath - path to TMalign/TMscore/MMalign executable + offset - add "offset" to residue index of model. + check_nmr - check whether "model_pdb" is multi-model NMR structure + ''' + tmp_dir="/tmp/"+os.getenv("USER")+'/'+algo+ \ + str(random.randint(100000000,999999999))+'/' + while(os.path.isdir(tmp_dir)): + tmp_dir="/tmp/"+os.getenv("USER")+'/'+algo+ \ + str(random.randint(100000000,999999999))+'/' + os.makedirs(tmp_dir) + shutil.copy(model_pdb,tmp_dir+"model.pdb") + shutil.copy(matrix_file,tmp_dir+"matrix.txt") + + # read model pdb + fp=open(tmp_dir+"model.pdb",'rU') + txt=fp.read() + fp.close() + + # reindex residue index + if offset!=0: + lines=txt.splitlines() + txt="" + for line in lines: + if line.startswith("ATOM ") or line.startswith("HETATM"): + resi=str(int(line[22:26])+offset) + line=line[:22]+' '*(4-len(resi))+resi+line[26:] + txt+=line+'\n' + fp=open(tmp_dir+"model.pdb",'w') + fp.write(txt) + fp.close() + + fp=open(tmp_dir+"matrix.txt",'rU') + stdout=fp.read() + fp.close() + + pattern=re.compile("^\s*[0123]"+"\s+[-]{0,1}[.\d]+"*4+"\s*$") + m=-1 + t=[0,0,0] + u=[[0,0,0] for e in range(3)] + for line in stdout.splitlines(): + matchall=pattern.findall(line) + if not matchall: + continue + m+=1 + t[m],u[m][0],u[m][1],u[m][2]=map(float, + re.split("\s+",line.strip())[1:]) + + fp=open(tmp_dir+"model.pdb",'rU') + pdb_lines=fp.read().splitlines() + fp.close() + + if m<=0: + return '\n'.join(pdb_lines)+'\n' + + + ''' Code for rotating Chain_1 from (x,y,z) to (X,Y,Z): + do i=1,L + X(i)=t(1)+u(1,1)*x(i)+u(1,2)*y(i)+u(1,3)*z(i) + Y(i)=t(2)+u(2,1)*x(i)+u(2,2)*y(i)+u(2,3)*z(i) + Z(i)=t(3)+u(3,1)*x(i)+u(3,2)*y(i)+u(3,3)*z(i) + enddo + ''' + pdb_txt='' + for idx,line in enumerate(pdb_lines): + if not line.startswith("ATOM ") and not line.startswith("HETATM"): + pdb_txt+=line+'\n' + continue + x=float(line[30:38]) + y=float(line[38:46]) + z=float(line[46:54]) + X='%.3f'%(t[0]+u[0][0]*x+u[0][1]*y+u[0][2]*z) + Y='%.3f'%(t[1]+u[1][0]*x+u[1][1]*y+u[1][2]*z) + Z='%.3f'%(t[2]+u[2][0]*x+u[2][1]*y+u[2][2]*z) + line=line[:30]+' '*(8-len(X))+X+' '*(8-len(Y))+Y+ \ + ' '*(8-len(Z))+Z+line[54:] + pdb_txt+=line+'\n' + ''' +COLUMNS DATA TYPE CONTENTS +----------------------------------------------------------------------- + 1 - 6 Record name "ATOM " + 7 - 11 Integer Atom serial number. +13 - 16 Atom Atom name. +17 Character Alternate location indicator. +18 - 20 Residue name Residue name. +22 Character Chain identifier. +23 - 26 Integer Residue sequence number. +27 AChar Code for insertion of residues. +31 - 38 Real(8.3) Orthogonal coordinates for X in Angstroms. +39 - 46 Real(8.3) Orthogonal coordinates for Y in Angstroms. +47 - 54 Real(8.3) Orthogonal coordinates for Z in Angstroms. +55 - 60 Real(6.2) Occupancy. +61 - 66 Real(6.2) Temperature factor (Default = 0.0). +73 - 76 LString(4) Segment identifier, left-justified. +77 - 78 LString(2) Element symbol, right-justified. +79 - 80 LString(2) Charge on the atom. + ''' + + if os.path.isdir(tmp_dir): + shutil.rmtree(tmp_dir) + + return pdb_txt + def superpose(model_pdb="mobile.pdb",native_pdb="target.pdb", algo="TMalign", execpath='',writePDB=True,offset=0): '''Superpose "model_pdb" to "native_pdb" ''' - if not execpath: # default path to executables + if not execpath and algo!="matrix": # default path to executables if algo.lower().startswith("pymol") or algo=="super": execpath="pymol" else: @@ -413,6 +523,8 @@ def superpose(model_pdb="mobile.pdb",native_pdb="target.pdb", elif algo.startswith("pymol-"): pdb_txt,RMSD_lst,TMscore_lst=sup_pymol( model_pdb,native_pdb,algo,execpath,offset) + elif algo=="matrix": + pdb_txt=sup_matrix(model_pdb,native_pdb,offset) model_pdb_basename=os.path.basename(model_pdb) ext_idx=model_pdb_basename.rfind('.') From 82780820c1c18a11a174e5b2e5af227c8897ae93 Mon Sep 17 00:00:00 2001 From: zcx Date: Tue, 10 Dec 2019 10:22:12 -0500 Subject: [PATCH 26/32] fix init.dat to model error for unconventional threading program --- initdat2pdb.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/initdat2pdb.py b/initdat2pdb.py index 7854cb4..c9e8e4c 100755 --- a/initdat2pdb.py +++ b/initdat2pdb.py @@ -156,16 +156,17 @@ def convert_initdat_txt(sequence='',initdat_txt='',CONECT=False): template_segment=line[54:59]+ \ code_with_modified_residues[line[60:63]]+" " \ - if line[60:63].strip() else " " + if line[60:63] in code_with_modified_residues else " " target_segment=line[21:26]+ \ residue_name_short+" " #code_with_modified_residues[line[17:20]]+" " template_residue_name=line[60:63] - if not line[60:63].strip(): + if not line[60:63] in code_with_modified_residues: template_residue_name=residue_name template_resi=line[55:59] - if not line[55:59].strip(): + if not line[55:59].strip() or \ + not line[60:63] in code_with_modified_residues: template_resi=residue_sequence_number if template_line and int(template_resi)<=int(template_line[22:26]): template_resi=str(1+int(template_line[22:26])) @@ -216,7 +217,7 @@ def convert_initdat_txt(sequence='',initdat_txt='',CONECT=False): sequence=sequence[:resi]+residue_name_short+sequence[resi+1:] template_sequence+='-'*(resi-len(template_sequence)) - if line[60:63].strip(): + if line[60:63] in code_with_modified_residues: template_sequence=template_sequence[:resi]+ \ code_with_modified_residues[line[60:63]]+template_sequence[resi+1:] else: From baf3bb00bba0ea3c9d13895d18115e1e1ebf3136 Mon Sep 17 00:00:00 2001 From: zcx Date: Tue, 13 Oct 2020 09:15:36 -0400 Subject: [PATCH 27/32] contact_pdb stat for dist --- contact_pdb.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/contact_pdb.py b/contact_pdb.py index 2016078..f074ee3 100755 --- a/contact_pdb.py +++ b/contact_pdb.py @@ -348,6 +348,9 @@ def calc_lnat_acc_dist(cmp_list,con_num_dict,sep_range=str(short_range_def)): )/len(top_pred[key]))**.5 coef_sigma[key]=sum([e[4]/e[5] for e in top_pred[key]] )/len(top_pred[key]) + else: + ACC[key]=0 + coef_sigma[key]=0 return ACC,coef_sigma,top_pred def calc_lnat_acc_contact(cmp_list,con_num_dict,sep_range=str(short_range_def)): @@ -419,6 +422,49 @@ def calc_acc_contact(cmp_list,L,sep_range=str(short_range_def)): ACC[key]=0 # error return ACC,top_pred + +def calc_acc_dist(cmp_list,con_num_dict,sep_range=str(short_range_def)): + '''Calculate residue contact accuracy using ouput if "compare_res_contact" + and native contact number diction "con_num_dict" ''' + top_pred=dict() # top short, medm, long, all prediction + + if not sep_range in ["medium","long"]: + top_pred["short1"]=[res_pair for res_pair in cmp_list if \ + short_range_def<=abs(res_pair[0]-res_pair[1]) Date: Fri, 20 Aug 2021 17:26:34 -0400 Subject: [PATCH 28/32] mixxing chainID in TER --- fetch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fetch.py b/fetch.py index f3bd9a8..b5e6e2e 100755 --- a/fetch.py +++ b/fetch.py @@ -394,8 +394,8 @@ def extract_chain(PDB_file,chainID_list=[], atom_serial_list=[] for line in pdb_lines: section=line[:6].rstrip() - if section in chainID_column and (not chainID_column[section] \ - or line[chainID_column[section]]==chainID): + if section in chainID_column and (not chainID_column[section] or ( + chainID_column[section] Date: Fri, 20 Aug 2021 17:40:38 -0400 Subject: [PATCH 29/32] allow chain ID with up to 4 char --- fetch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fetch.py b/fetch.py index b5e6e2e..443c4aa 100755 --- a/fetch.py +++ b/fetch.py @@ -61,7 +61,7 @@ scop_mirror="http://scop.berkeley.edu/downloads/pdbstyle" pdb_pattern=re.compile("^\d\w{3}$") -pdb_chain_pattern=re.compile("^\d\w{4,5}$") +pdb_chain_pattern=re.compile("^\d\w{4,7}$") accession_pattern=re.compile(# uniprot accession (AC) "^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$") entry_pattern=re.compile("^\w{1,5}_\w{1,5}") # uniprot entry name @@ -195,8 +195,8 @@ def fetch_bundle(PDBid,no_err=False): # and software tools that rely solely on the PDB file format. PDBid=PDBid.lower() tarball_name=PDBid+"-pdb-bundle.tar.gz" - return wget(pdb_bundle_mirror+PDBid[1:3]+'/'+PDBid+'/'+tarball_name - ,tarball_name,no_err) + url=pdb_bundle_mirror+PDBid[1:3]+'/'+PDBid+'/'+tarball_name + return wget(url, tarball_name,no_err) def extract_chain_from_bundle(tarball_name,chainID_list=[]): '''split best effort/minimal PDB tarball into PDB files containing From cf7990d78be18387e9c8936a81fcd177396a423f Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Fri, 17 Sep 2021 14:52:43 -0400 Subject: [PATCH 30/32] addchainID --- addchainID.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100755 addchainID.py diff --git a/addchainID.py b/addchainID.py new file mode 100755 index 0000000..f56e94d --- /dev/null +++ b/addchainID.py @@ -0,0 +1,23 @@ +#!/usr/bin/python +docstring=''' +addchainID.py input.pdb output.pdb chainID +''' +import sys + +if len(sys.argv)!=4: + sys.stderr.write(docstring) + exit() + +infile =sys.argv[1] +outfile=sys.argv[2] +chainID=sys.argv[3] + +txt='' +fp=open(infile) +for line in fp.read().splitlines(): + if line.startswith("ATOM ") or line.startswith("HETATM"): + txt+=line[:21]+chainID+line[22:]+'\n' +fp.close() +fp=open(outfile,'w') +fp.write(txt) +fp.close() From 194628065151e7615087ad2274265ffa3783d81b Mon Sep 17 00:00:00 2001 From: zcx Date: Fri, 17 Sep 2021 14:54:04 -0400 Subject: [PATCH 31/32] long chain ID --- fetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fetch.py b/fetch.py index f3bd9a8..877f879 100755 --- a/fetch.py +++ b/fetch.py @@ -61,7 +61,7 @@ scop_mirror="http://scop.berkeley.edu/downloads/pdbstyle" pdb_pattern=re.compile("^\d\w{3}$") -pdb_chain_pattern=re.compile("^\d\w{4,5}$") +pdb_chain_pattern=re.compile("^\d\w{4,7}$") accession_pattern=re.compile(# uniprot accession (AC) "^[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$") entry_pattern=re.compile("^\w{1,5}_\w{1,5}") # uniprot entry name From 679b93418b942e3abb2a10ed8d4db266b7c550f6 Mon Sep 17 00:00:00 2001 From: Chengxin Zhang Date: Wed, 20 Oct 2021 15:16:03 -0400 Subject: [PATCH 32/32] insertion code in pdb2fasta --- pdb2fasta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdb2fasta.py b/pdb2fasta.py index ef5708a..fec92e6 100755 --- a/pdb2fasta.py +++ b/pdb2fasta.py @@ -248,7 +248,7 @@ def pdbtxt2seq(txt='',infile='pdb.pdb',PERMISSIVE="MSE",outfmt="PDB", # underscore for empty chain identifier chain_id=line[21].replace(' ','_') - res_num=int(line[22:26]) # residue sequence number + res_num=line[22:27] # residue sequence number aa=aa3to1[residue] if residue in aa3to1 else 'X' # one letter AA name if not allowX and aa=='X': continue