diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 0000000..a565aba --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,14 @@ +image: nfcore/gitpod:latest + +vscode: + extensions: # based on nf-core.nf-core-extensionpack + - codezombiech.gitignore # Language support for .gitignore files + # - cssho.vscode-svgviewer # SVG viewer + - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code + - eamodio.gitlens # Quickly glimpse into whom, why, and when a line or code block was changed + - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files + - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar + - mechatroner.rainbow-csv # Highlight columns in csv files in different colors + # - nextflow.nextflow # Nextflow syntax highlighting + - oderwat.indent-rainbow # Highlight indentation level + - streetsidesoftware.code-spell-checker # Spelling checker for source code \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ddfafb..70413f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,20 @@ Started only in version 0.5 +## v1.2.3 + +- Added ICEberg and PHAST results to bacannot json summary file + +## v1.2.2 + +- Added a file size check: bacannot summary creation is only entered if results files are not empty +- Fixed mob_suite summary parsing (added .astype(str)) +- Moved MGE dict key generation to inside the integron_finder loop + +## v1.2.1 + +- Added a quick fix for plasmid finder results parsing: results for gram-negative bacteria have only one database, whereas results for gram-positive bacteria have more than one. 
+ ## v0.5 ### hotfix diff --git a/build_conda.sh b/build_conda.sh index ea63aba..d3d0571 100644 --- a/build_conda.sh +++ b/build_conda.sh @@ -9,7 +9,8 @@ conda convert -p osx-64 $(find build -name "falmeida-py*.tar.bz2") # upload osx anaconda upload $(find osx-64 -name "falmeida-py*.tar.bz2") --force -( cd build && anaconda upload $(find linux-64 -name "falmeida-py*.tar.bz2") --force ) +sleep 140 +anaconda upload $(find build/linux-64 -name "falmeida-py*.tar.bz2") --force # rm dirs rm -rf build osx-64 diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index ff18464..b1e862a 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -1,20 +1,20 @@ package: name: falmeida-py - version: '0.9' + version: '1.2.4' source: path: .. build: number: 0 - script: python setup.py install --single-version-externally-managed --record=record.txt + script: pip install . entry_points: - processrcmfolder = uconnrcmpy.dataprocessing:process_folder requirements: build: - - python >=3.7,{{PY_VER}}* + - python>=3.8 - setuptools - setuptools-git - pandas @@ -26,7 +26,7 @@ requirements: - pyyaml run: - - python {{PY_VER}}* + - python>=3.8 - setuptools - setuptools-git - pandas diff --git a/falmeida_py/__main__.py b/falmeida_py/__main__.py index be6003d..4b94bba 100644 --- a/falmeida_py/__main__.py +++ b/falmeida_py/__main__.py @@ -148,8 +148,8 @@ def main(): print(f"Processing file: {args['--gbk']}!") gbk2fasta(gbk=args['--gbk']) blast(task='blastn', query=args['--fasta'], subject='tmp_gbk.fa', - culling=args['--culling_limit'], minid=args['--minid'], mincov=args['--mincov'], - out='out.blast', threads=1, twoway=None) + culling=args['--culling_limit'], minid=args['--minid'], mincov=args['--mincov'], + out='out.blast', threads=1, twoway=None) filtergbk(gbk=args['--gbk'], out=args['--out'], extension=int(args['--extension'])) # Clean dir @@ -241,8 +241,8 @@ def main(): ####################################### ### Without commands nor parameters ### 
####################################### - else: - print(usage.strip()) + elif not arguments['']: + print(usage.strip()) ## Calling main if __name__ == '__main__': diff --git a/falmeida_py/bacannot2json.py b/falmeida_py/bacannot2json.py index 80a342a..b4a8b75 100644 --- a/falmeida_py/bacannot2json.py +++ b/falmeida_py/bacannot2json.py @@ -37,33 +37,25 @@ from .plasmid_function import * from .virulence_function import * from .resistance_function import * +from .mges_function import * ############################## ### fix keys in dictionary ### ############################## +def convert_dictkey(d): + ###change all keys in a dict d + return { str(k): convert_dictvalue(v) for k,v in d.items() } + +def convert_dictvalue(v): + ###if v is a dict do convert_dictkey() for v, else raise v + if isinstance(v, dict): + return convert_dictkey(v) + else: + return v + def stringify_keys(d): """Convert a dict's keys to strings if they are not.""" - for key in d.keys(): - - # check inner dict - if isinstance(d[key], dict): - value = stringify_keys(d[key]) - else: - value = d[key] - - # convert nonstring to string if needed - if not isinstance(key, str): - try: - d[str(key)] = value - except Exception: - try: - d[repr(key)] = value - except Exception: - raise - - # delete old key - del d[key] - return d + return convert_dictkey(d) ############################################### ### based on annotations figure sample name ### @@ -112,6 +104,9 @@ def bacannot2json(indir, outfile, check): # check virulence annotation stats virulence_stats( bacannot_summary ) + # check MGEs annotation stats + mges_stats( bacannot_summary ) + # check plasmids annotation stats plasmids_stats( bacannot_summary ) diff --git a/falmeida_py/general_stats_function.py b/falmeida_py/general_stats_function.py index 74e0893..80e2590 100644 --- a/falmeida_py/general_stats_function.py +++ b/falmeida_py/general_stats_function.py @@ -37,10 +37,10 @@ def general_stats(bacannot_summary): # save annotation stats 
bacannot_summary[sample]['general_annotation'] = {} bacannot_summary[sample]['general_annotation']['mlst'] = str(mlst_results[2].item()).replace('-', 'null') - bacannot_summary[sample]['general_annotation']['cds'] = general_results['CDS'] - bacannot_summary[sample]['general_annotation']['rrna'] = general_results['rRNA'] - bacannot_summary[sample]['general_annotation']['trna'] = general_results['tRNA'] - bacannot_summary[sample]['general_annotation']['tmrna'] = general_results['tmRNA'] + bacannot_summary[sample]['general_annotation']['cds'] = general_results.get('CDS', 0) + bacannot_summary[sample]['general_annotation']['rrna'] = general_results.get('rRNA', 0) + bacannot_summary[sample]['general_annotation']['trna'] = general_results.get('tRNA', 0) + bacannot_summary[sample]['general_annotation']['tmrna'] = general_results.get('tmRNA', 0) bacannot_summary[sample]['general_annotation']['closest_reference'] = {} bacannot_summary[sample]['general_annotation']['closest_reference']['strain'] = refseq_masher_results.head(1)['top_taxonomy_name'].item() diff --git a/falmeida_py/mges_function.py b/falmeida_py/mges_function.py new file mode 100644 index 0000000..39014e0 --- /dev/null +++ b/falmeida_py/mges_function.py @@ -0,0 +1,169 @@ +################################## +### Loading Necessary Packages ### +################################## +import pandas as pd +import os +from .utils import load_and_subset_gff + +#################################### +### check MGEs annotations stats ### +#################################### +def mges_stats(bacannot_summary): + + # iterate over available samples + for sample in bacannot_summary: + + # load dir of samples' results + results_dir = bacannot_summary[sample]['results_dir'] + + # load gff_file + gff_file = f"{results_dir}/gffs/{sample}.gff" + + # integron_finder + if os.path.exists(f"{results_dir}/integron_finder/{sample}_integrons.gff") and os.stat(f"{results_dir}/integron_finder/{sample}_integrons.gff").st_size > 0: + + # init 
MGE annotation dictionary + if 'MGE' not in bacannot_summary[sample]: + bacannot_summary[sample]['MGE'] = {} + + # init integron_finder annotation dictionary + bacannot_summary[sample]['MGE']['integron_finder'] = {} + + # load integron_finder results + results = pd.read_csv( + f"{results_dir}/integron_finder/{sample}_integrons.gff", + sep='\t', + header=None, + names=[ + 'chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'frame', 'atts' + ] + ) + + # number of integron_finder annotations + total_number = len(results.index) + bacannot_summary[sample]['MGE']['integron_finder']['total'] = total_number + + # per integron info + bacannot_summary[sample]['MGE']['integron_finder'] = {} + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['chr'].unique() ]: + + bacannot_summary[sample]['MGE']['integron_finder'][seq] = {} + for index, row in results[results['chr'] == seq].reset_index().iterrows(): + id = row['atts'].split(';')[0].split('=')[-1] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id] = {} + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['id'] = id + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['contig'] = row['chr'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['start'] = row['start'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['end'] = row['end'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['type'] = row['atts'].split(';')[1].split('=')[-1] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['source'] = row['source'] + bacannot_summary[sample]['MGE']['integron_finder'][seq][id]['product'] = row['type'] + + # ICEberg database + ice_db_blastp = f"{results_dir}/ICEs/{sample}_iceberg_blastp_onGenes.summary.txt" + if os.path.exists(ice_db_blastp) and os.stat(ice_db_blastp).st_size > 0: + + # init MGE annotation dictionary + if 'MGE' not in bacannot_summary[sample]: + bacannot_summary[sample]['MGE'] = {} + + # init iceberg annotation 
dictionary + if 'ICE' not in bacannot_summary[sample]['MGE']: + bacannot_summary[sample]['MGE']['ICEberg'] = {} + + # init iceberg blastp annotation dictionary + bacannot_summary[sample]['MGE']['ICEberg']['blastp'] = {} + + # load integron_finder results + results = pd.read_csv( + ice_db_blastp, + sep='\t' + ) + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'ICEberg') + + # number of integron_finder annotations + total_number = len(results.index) + bacannot_summary[sample]['MGE']['ICEberg']['blastp']['total'] = total_number + + # per gene info + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['SEQUENCE'].unique() ]: + + # details missing in output but available in gff + gff_row = gff[gff['attributes'].str.contains(seq)] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq] = {} + for index, row in results[results['SEQUENCE'] == seq].reset_index().iterrows(): + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq] = {} + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['id'] = row['ICEBERG_ID'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['contig'] = contig + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['start'] = start + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['end'] = end + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['accession'] = row['ACCESSION'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['product'] = row['PRODUCT'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['description'] = row['DESCRIPTION'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_start'] = row['START'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_end'] = row['END'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_identity'] = row['%IDENTITY'] + 
bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['blast_coverage'] = row['%COVERAGE'] + bacannot_summary[sample]['MGE']['ICEberg']['blastp'][seq]['strand'] = row['STRAND'] + + # PHAST database + phast_db_blastp = f"{results_dir}/prophages/phast_db/{sample}_phast_blastp_onGenes.summary.txt" + if os.path.exists(phast_db_blastp) and os.stat(phast_db_blastp).st_size > 0: + + # init MGE annotation dictionary + if 'MGE' not in bacannot_summary[sample]: + bacannot_summary[sample]['MGE'] = {} + + # init phast annotation dictionary + if 'ICE' not in bacannot_summary[sample]['MGE']: + bacannot_summary[sample]['MGE']['PHAST'] = {} + + # init phast blastp annotation dictionary + bacannot_summary[sample]['MGE']['PHAST']['blastp'] = {} + + # load integron_finder results + results = pd.read_csv( + phast_db_blastp, + sep='\t' + ) + + # load gff + gff = load_and_subset_gff(gff_file, 'source', 'PHAST') + + # number of integron_finder annotations + total_number = len(results.index) + bacannot_summary[sample]['MGE']['PHAST']['blastp']['total'] = total_number + + # per gene info + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['SEQUENCE'].unique() ]: + + # details missing in output but available in gff + gff_row = gff[gff['attributes'].str.contains(seq)] + contig = gff_row['seq'].item() + start = gff_row['start'].item() + end = gff_row['end'].item() + + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq] = {} + for index, row in results[results['SEQUENCE'] == seq].reset_index().iterrows(): + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq] = {} + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['id'] = row['PHAST_ID'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['contig'] = contig + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['start'] = start + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['end'] = end + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['accession'] = row['ACCESSION'] + 
bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['gene'] = row['GENE'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['description'] = row['DESCRIPTION'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_start'] = row['START'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_end'] = row['END'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_identity'] = row['%IDENTITY'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['blast_coverage'] = row['%COVERAGE'] + bacannot_summary[sample]['MGE']['PHAST']['blastp'][seq]['strand'] = row['STRAND'] \ No newline at end of file diff --git a/falmeida_py/plasmid_function.py b/falmeida_py/plasmid_function.py index aab1e37..789aa9a 100644 --- a/falmeida_py/plasmid_function.py +++ b/falmeida_py/plasmid_function.py @@ -2,11 +2,7 @@ ### Loading Necessary Packages ### ################################## import pandas as pd -from .utils import find_files -import json import os -import yaml -from pathlib import Path ######################################## ### check plasmids annotations stats ### @@ -19,29 +15,27 @@ def plasmids_stats(bacannot_summary): # load dir of samples' results results_dir = bacannot_summary[sample]['results_dir'] - # load annotation stats - if os.path.exists(f"{results_dir}/plasmids"): - - # init plasmids annotation dictionary - bacannot_summary[sample]['plasmid'] = {} + # init plasmids annotation dictionary + bacannot_summary[sample]['plasmid'] = {} - # platon - if os.path.exists(f"{results_dir}/plasmids/platon/{sample}.tsv"): + # platon + if os.path.exists(f"{results_dir}/plasmids/platon/{sample}.tsv") and os.stat(f"{results_dir}/plasmids/platon/{sample}.tsv").st_size > 0: - # init platon annotation dictionary - bacannot_summary[sample]['plasmid']['platon'] = {} + # init platon annotation dictionary + bacannot_summary[sample]['plasmid']['platon'] = {} - # load platon results - results = pd.read_csv( - 
f"{results_dir}/plasmids/platon/{sample}.tsv", - sep='\t' - ) + # load platon results + results = pd.read_csv( + f"{results_dir}/plasmids/platon/{sample}.tsv", + sep='\t' + ) - # number of plasmid annotations - total_number = len(results.index) - bacannot_summary[sample]['plasmid']['platon']['total'] = total_number + # number of plasmid annotations + total_number = len(results.index) + bacannot_summary[sample]['plasmid']['platon']['total'] = total_number - # per plasmid info + # per plasmid info + if int(results.shape[0]) > 0: for seq in [x for x in results['ID'].unique()]: bacannot_summary[sample]['plasmid']['platon'][seq] = {} bacannot_summary[sample]['plasmid']['platon'][seq]['Length'] = results.loc[results['ID'] == seq, 'Length'].item() @@ -51,37 +45,89 @@ def plasmids_stats(bacannot_summary): bacannot_summary[sample]['plasmid']['platon'][seq]['Replication'] = results.loc[results['ID'] == seq, '# Replication'].item() bacannot_summary[sample]['plasmid']['platon'][seq]['Mobilization'] = results.loc[results['ID'] == seq, '# Mobilization'].item() bacannot_summary[sample]['plasmid']['platon'][seq]['Conjugation'] = results.loc[results['ID'] == seq, '# Conjugation'].item() - - # plasmidfinder - if os.path.exists(f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv"): - - # init platon annotation dictionary - bacannot_summary[sample]['plasmid']['plasmidfinder'] = {} - - # load platon results - results = pd.read_csv( - f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv", - sep='\t' - ) - - if not results.empty: - - # databases - bacannot_summary[sample]['plasmid']['plasmidfinder']['meta'] = {} - bacannot_summary[sample]['plasmid']['plasmidfinder']['meta']['database'] = results['Database'].unique().item() - - # number of plasmid annotations - total_number = len(results['Contig'].unique()) - bacannot_summary[sample]['plasmid']['plasmidfinder']['total'] = total_number - - # plasmid annotations contigs - for seq in [ str(x) for x in results['Contig'].unique() ]: - 
bacannot_summary[sample]['plasmid']['plasmidfinder'][seq] = {} - bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['inc_types'] = {} - bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['identity'] = {} - bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['accession'] = {} - for index, row in results.iterrows(): - contig = row['Contig'] - bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['inc_types'] = row['Plasmid'] - bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['identity'] = row['Identity'] - bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['accession'] = row['Accession number'] \ No newline at end of file + + # plasmidfinder + if os.path.exists(f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv") and os.stat(f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv").st_size > 0: + + # init platon annotation dictionary + bacannot_summary[sample]['plasmid']['plasmidfinder'] = {} + + # load platon results + results = pd.read_csv( + f"{results_dir}/plasmids/plasmidfinder/results_tab.tsv", + sep='\t' + ) + + if not results.empty: + + # databases + print( results['Database'].unique() ) + bacannot_summary[sample]['plasmid']['plasmidfinder']['meta'] = {} + db_arr = results['Database'].unique() + bacannot_summary[sample]['plasmid']['plasmidfinder']['meta']['database'] = db_arr.tolist() if len(db_arr) > 1 else db_arr.item() + + # number of plasmid annotations + total_number = len(results['Contig'].unique()) + bacannot_summary[sample]['plasmid']['plasmidfinder']['total'] = total_number + + # plasmid annotations contigs + for seq in [ str(x) for x in results['Contig'].unique() ]: + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq] = {} + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['inc_types'] = {} + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['identity'] = {} + bacannot_summary[sample]['plasmid']['plasmidfinder'][seq]['accession'] = {} + + for index, row in 
results.iterrows(): + contig = str(row['Contig']) + bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['inc_types'] = row['Plasmid'] + bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['identity'] = row['Identity'] + bacannot_summary[sample]['plasmid']['plasmidfinder'][contig]['accession'] = row['Accession number'] + + # mob suite + if os.path.exists(f"{results_dir}/plasmids/mob_suite/{sample}_mobtyper_results.txt") and os.stat(f"{results_dir}/plasmids/mob_suite/{sample}_mobtyper_results.txt").st_size > 0: + + # init integron_finder annotation dictionary + bacannot_summary[sample]['plasmid']['mob_suite'] = {} + + # load integron_finder results + results = pd.read_csv( + f"{results_dir}/plasmids/mob_suite/{sample}_mobtyper_results.txt", + sep='\t', + header='infer', + # sample_id num_contigs size gc md5 rep_type(s) rep_type_accession(s) relaxase_type(s) relaxase_type_accession(s) mpf_type mpf_type_accession(s) orit_type(s) orit_accession(s) predicted_mobility mash_nearest_neighbor mash_neighbor_distance mash_neighbor_identification primary_cluster_id secondary_cluster_id predicted_host_range_overall_rank predicted_host_range_overall_name observed_host_range_ncbi_rank observed_host_range_ncbi_name reported_host_range_lit_rank reported_host_range_lit_name associated_pmid(s) + ) + + # number of plasmid types annotated annotations + # total_number = len(results.index) - 1 # always counts chromosome + # bacannot_summary[sample]['plasmid']['mob_suite']['total'] = total_number + + # per integron info + bacannot_summary[sample]['plasmid']['mob_suite'] = {} + if int(results.shape[0]) > 0: + for seq in [ str(x) for x in results['sample_id'].unique() ]: + + bacannot_summary[sample]['plasmid']['mob_suite'][seq] = {} + for index, row in results[results['sample_id'].astype(str) == seq].reset_index().iterrows(): + id = row['sample_id'] # they are the same for this result + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id] = {} + 
bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['size'] = row['size'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['rep_type'] = row['rep_type(s)'].replace(',', '; ') + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['rep_type_accession'] = row['rep_type_accession(s)'].replace(',', '; ') + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['relaxase_type'] = row['relaxase_type(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['relaxase_type_accession(s)'] = row['relaxase_type_accession(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mpf_type'] = row['mpf_type'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mpf_type_accession'] = row['mpf_type_accession(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['orit_type'] = row['orit_type(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['orit_accession'] = row['orit_accession(s)'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mash_nearest_neighbor'] = row['mash_nearest_neighbor'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mash_neighbor_distance'] = row['mash_neighbor_distance'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['mash_neighbor_identification'] = row['mash_neighbor_identification'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['primary_cluster_id'] = row['primary_cluster_id'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['secondary_cluster_id'] = row['secondary_cluster_id'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['predicted_host_range_overall_rank'] = row['predicted_host_range_overall_rank'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['predicted_host_range_overall_name'] = row['predicted_host_range_overall_name'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['observed_host_range_ncbi_rank'] = row['observed_host_range_ncbi_rank'] + 
bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['observed_host_range_ncbi_name'] = row['observed_host_range_ncbi_name'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['reported_host_range_lit_rank'] = row['reported_host_range_lit_rank'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['reported_host_range_lit_name'] = row['reported_host_range_lit_name'] + bacannot_summary[sample]['plasmid']['mob_suite'][seq][id]['associated_pmid'] = row['associated_pmid(s)'] \ No newline at end of file diff --git a/falmeida_py/resistance_function.py b/falmeida_py/resistance_function.py index 7a4460a..7f7ff01 100644 --- a/falmeida_py/resistance_function.py +++ b/falmeida_py/resistance_function.py @@ -33,7 +33,7 @@ def resistance_stats(bacannot_summary): ########### ### rgi ### ########### - if os.path.exists(f"{results_dir}/resistance/RGI/RGI_{sample}.txt"): + if os.path.exists(f"{results_dir}/resistance/RGI/RGI_{sample}.txt") and os.stat(f"{results_dir}/resistance/RGI/RGI_{sample}.txt").st_size > 0: # init rgi annotation dictionary bacannot_summary[sample]['resistance']['rgi'] = {} @@ -84,7 +84,7 @@ def resistance_stats(bacannot_summary): ##################### ### amrfinderplus ### ##################### - if os.path.exists(f"{results_dir}/resistance/AMRFinderPlus/AMRFinder_resistance-only.tsv"): + if os.path.exists(f"{results_dir}/resistance/AMRFinderPlus/AMRFinder_resistance-only.tsv") and os.stat(f"{results_dir}/resistance/AMRFinderPlus/AMRFinder_resistance-only.tsv").st_size > 0: # init amrfinderplus annotation dictionary bacannot_summary[sample]['resistance']['amrfinderplus'] = {} @@ -130,10 +130,13 @@ def resistance_stats(bacannot_summary): # # check for rgi orthologies # - if gene in bacannot_summary[sample]['resistance']['rgi']: - bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = bacannot_summary[sample]['resistance']['rgi'][gene]['card_aro'] - else: - 
bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = None + try: + if gene in bacannot_summary[sample]['resistance']['rgi']: + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = bacannot_summary[sample]['resistance']['rgi'][gene]['card_aro'] + else: + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = None + except: + bacannot_summary[sample]['resistance']['amrfinderplus'][gene]['card_aro'] = None ################# ### resfinder ### @@ -141,7 +144,7 @@ def resistance_stats(bacannot_summary): # # TODO: Include genomic coordinates info # - if os.path.exists(f"{results_dir}/resistance/resfinder/ResFinder_results_tab.txt"): + if os.path.exists(f"{results_dir}/resistance/resfinder/ResFinder_results_tab.txt") and os.stat(f"{results_dir}/resistance/resfinder/ResFinder_results_tab.txt").st_size > 0: # init resfinder annotation dictionary bacannot_summary[sample]['resistance']['resfinder'] = {} @@ -177,7 +180,10 @@ def resistance_stats(bacannot_summary): # # check for rgi orthologies # - if gene in bacannot_summary[sample]['resistance']['rgi']: - bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = bacannot_summary[sample]['resistance']['rgi'][gene]['card_aro'] - else: - bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = None + try: + if gene in bacannot_summary[sample]['resistance']['rgi']: + bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = bacannot_summary[sample]['resistance']['rgi'][gene]['card_aro'] + else: + bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = None + except: + bacannot_summary[sample]['resistance']['resfinder'][gene]['card_aro'] = None diff --git a/falmeida_py/version.py b/falmeida_py/version.py index 5a92447..e524e2d 100644 --- a/falmeida_py/version.py +++ b/falmeida_py/version.py @@ -14,7 +14,7 @@ If not, see . 
""" -__version__ = '0.9' +__version__ = '1.2.4' def get_version(): return __version__ diff --git a/falmeida_py/virulence_function.py b/falmeida_py/virulence_function.py index 8656e30..91e912d 100644 --- a/falmeida_py/virulence_function.py +++ b/falmeida_py/virulence_function.py @@ -30,7 +30,7 @@ def virulence_stats(bacannot_summary): bacannot_summary[sample]['virulence'] = {} # vfdb - if os.path.exists(f"{results_dir}/virulence/vfdb/{sample}_vfdb_blastn_onGenes.summary.txt"): + if os.path.exists(f"{results_dir}/virulence/vfdb/{sample}_vfdb_blastn_onGenes.summary.txt") and os.stat(f"{results_dir}/virulence/vfdb/{sample}_vfdb_blastn_onGenes.summary.txt").st_size > 0: # init VFDB annotation dictionary bacannot_summary[sample]['virulence']['VFDB'] = {} @@ -73,7 +73,7 @@ def virulence_stats(bacannot_summary): bacannot_summary[sample]['virulence']['VFDB'][gene]['end'] = end # victors - if os.path.exists(f"{results_dir}/virulence/victors/{sample}_victors_blastp_onGenes.summary.txt"): + if os.path.exists(f"{results_dir}/virulence/victors/{sample}_victors_blastp_onGenes.summary.txt") and os.stat(f"{results_dir}/virulence/victors/{sample}_victors_blastp_onGenes.summary.txt").st_size > 0: # init victors annotation dictionary gff = bacannot_summary[sample]['virulence']['Victors'] = {} diff --git a/requirements.txt b/requirements.txt index baf0574..eff09bb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ tabulate biopython simplejson importlib_metadata -yaml +pyyaml diff --git a/setup.py b/setup.py index 0535747..7b2d7ed 100644 --- a/setup.py +++ b/setup.py @@ -30,18 +30,20 @@ def readme(): with open('requirements.txt') as f: required = f.read().splitlines() -setup(name='falmeida-py', - version=__version__, - description='falmeida-py: a package to the simple distribution of my custom scripts.', - long_description=readme(), - long_description_content_type='text/markdown', - url='https://github.com/fmalmeida/pythonScripts', - author='Felipe Almeida', - 
author_email='almeidafmarques@gmail.com', - license='GPLv3', - packages=['falmeida_py'], - install_requires=required, - entry_points={"console_scripts": ['falmeida-py = falmeida_py.__main__:main']}, - include_package_data=True, - zip_safe=False, - python_requires='>=3.6') +setup( + name='falmeida-py', + version=__version__, + description='falmeida-py: a package to the simple distribution of my custom scripts.', + long_description=readme(), + long_description_content_type='text/markdown', + url='https://github.com/fmalmeida/pythonScripts', + author='Felipe Almeida', + author_email='almeidafmarques@gmail.com', + license='GPLv3', + packages=['falmeida_py'], + install_requires=required, + entry_points={"console_scripts": ['falmeida-py = falmeida_py.__main__:main']}, + include_package_data=True, + zip_safe=False, + python_requires='>=3.6' +)