diff --git a/chython/files/libinchi/wrapper.py b/chython/files/libinchi/wrapper.py index 8d583fb6..fb269dcc 100644 --- a/chython/files/libinchi/wrapper.py +++ b/chython/files/libinchi/wrapper.py @@ -533,7 +533,7 @@ class INCHIStructure(Structure): elif platform.startswith('macosx') and platform.endswith('x86_64'): opt_flag = '-' libname = 'libinchi.dynlib' -elif platform.startswith('macosx') and platform.endswith('arm64'): +elif platform.startswith('macosx'): opt_flag = '-' libname = 'libinchi_arm64.dylib' else: diff --git a/chython/reactor/base.py b/chython/reactor/base.py index a7fba7fe..b357669c 100644 --- a/chython/reactor/base.py +++ b/chython/reactor/base.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014-2025 Ramil Nugmanov +# Copyright 2014-2026 Ramil Nugmanov # Copyright 2019 Adelia Fatykhova # This file is part of chython. # @@ -96,8 +96,10 @@ def _patcher(self, structure: MoleculeContainer, mapping): a.is_radical = ra.is_radical if ra.stereo is not None: # override stereo a._stereo = ra.stereo + a._extended_stereo = sa.extended_stereo # stereo swap doesn't resolve uncertainty elif sa.stereo is not None: # keep original stereo stereo_atoms.append(m) # mark for stereo fix + a._extended_stereo = sa._extended_stereo # masked by property if retranslation fails else: raise ValueError("AnyElement doesn't match to pattern") else: # QueryElement or Element @@ -111,6 +113,7 @@ def _patcher(self, structure: MoleculeContainer, mapping): if isinstance(ra, Element): a._implicit_hydrogens = ra.implicit_hydrogens # keep H count from patch a.xy = ra.xy # keep coordinates from patch + a._extended_stereo = ra._extended_stereo # propagate template's extended stereo elif ra.implicit_hydrogens: # keep H count from patch a._implicit_hydrogens = ra.implicit_hydrogens[0] else: # existing atoms @@ -118,8 +121,14 @@ def _patcher(self, structure: MoleculeContainer, mapping): a.xy = sa.xy # preserve existing coordinates if ra.stereo is not None: a._stereo = ra.stereo + # 
template's extended stereo takes priority (e.g. reaction giving racemic product) + if isinstance(ra, Element) and ra.extended_stereo is not None: + a._extended_stereo = ra.extended_stereo + else: # stereo swap doesn't resolve uncertainty + a._extended_stereo = sa.extended_stereo elif sa.stereo is not None: # keep original stereo stereo_atoms.append(m) + a._extended_stereo = sa._extended_stereo natoms[m] = a nbonds[m] = {} @@ -144,6 +153,7 @@ def _patcher(self, structure: MoleculeContainer, mapping): for n, sa in satoms.items(): # add unmatched or masked atoms if n not in patched_atoms and n not in to_delete: natoms[n] = a = sa.copy(hydrogens=True) + a._extended_stereo = sa._extended_stereo # preserve for atoms outside RC nbonds[n] = {} if sa.stereo is not None: # in case of allenes label can disappear/change, thus, requires recalculation diff --git a/chython/reactor/test/test_transformer.py b/chython/reactor/test/test_transformer.py index c6b57b9c..bc157371 100644 --- a/chython/reactor/test/test_transformer.py +++ b/chython/reactor/test/test_transformer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2025 Ramil Nugmanov +# Copyright 2025, 2026 Ramil Nugmanov # This file is part of chython. # # chython is free software; you can redistribute it and/or modify @@ -38,3 +38,41 @@ def test_transformer(pattern, replacement, source, result): mol = smiles(source) out = {format(smiles(x), 'h') for x in ([result] if isinstance(result, str) else result)} assert {format(x, 'h') for x in transformer(mol)} == out + + +# Extended stereo test cases. +# +# Extended stereo encodes epistemic state (what we know about the mixture), not geometry: +# AND group (+N): racemic mixture — all centers in the group share relative config +# OR group (-N): pure compound, unknown absolute config at each center +# +# Rules implemented in _patcher: +# 1. Atoms outside RC: extended stereo always preserved +# 2. 
Template overrides stereo (swap/inversion): uncertainty is unchanged, preserve reactant's ext stereo +# 3. Stereo lost (center destroyed or retranslation fails): ext stereo dropped (masked by property) +# 4. MoleculeContainer template with extended stereo: template's label takes priority + +ext_data = [ + # AND group outside RC preserved + ('[C:1]Br', '[A:1][O;M]', 'C[C@H](CC)CBr |&1:1|', '[C@H](CO)(CC)C |&1:0|'), + # OR group outside RC preserved + ('[C:1]Br', '[A:1][O;M]', 'C[C@H](CC)CBr |o1:1|', '[C@H](CO)(CC)C |o1:0|'), + # AND group in RC, stereo dropped + ('[C:1]Br', '[A:1][O;M]', 'CC[C@@H](C)Br |&1:2|', 'CCC(C)O'), + # AND group in RC, stereo inverted — swap doesn't resolve uncertainty + ('[C;M][C;@;h1:1]([O;M])[N;M]', '[A;@@:1]', 'CC[C@H](O)N |&1:2|', 'O[C@@H](N)CC |&1:1|'), + # OR group in RC, stereo inverted — swap doesn't resolve uncertainty + ('[C;M][C;@;h1:1]([O;M])[N;M]', '[A;@@:1]', 'CC[C@H](O)N |o1:2|', 'O[C@@H](N)CC |o1:1|'), + # two AND members: one loses stereo in RC, other outside RC keeps label + ('[C:1]Br', '[A:1][N;M]', 'C[C@H](F)[C@H](O)Br |&1:1,3|', 'OC(N)[C@H](C)F |&1:3|'), + # multiple independent AND/OR groups preserved + ('[C:1]Br', '[A:1][O;M]', '[C@H](F)(O)[C@@H](Cl)CBr |&1:0,o1:3|', + 'O[C@@H]([C@@H](Cl)CO)F |&1:1,o1:2|'), +] + + +@mark.parametrize('pattern, replacement, source, expected', ext_data) +def test_extended_stereo(pattern, replacement, source, expected): + t = Transformer(smarts(pattern), smarts(replacement)) + p = next(iter(t(smiles(source)))) + assert p == smiles(expected) diff --git a/doc/conf.py b/doc/conf.py index b98ffc2b..d4588e2d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,41 +1,50 @@ # -*- coding: utf-8 -*- from os.path import abspath from sys import path + parent = abspath('..') if parent not in path: path.insert(0, parent) -from chython.periodictable import C, QueryC, ListElement, AnyElement, AnyMetal +project = 'chython' author = 'Dr. Ramil Nugmanov' -copyright = '2014-2023, Dr. 
Ramil Nugmanov ' +copyright = '2014-2026, Dr. Ramil Nugmanov' version = '1.x' -project = 'chython' -needs_sphinx = '1.8' -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'nbsphinx'] +needs_sphinx = '7.0' +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.viewcode', +] -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] -templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'tutorial'] source_suffix = '.rst' master_doc = 'index' - language = 'en' -pygments_style = 'sphinx' -todo_include_todos = False -autoclass_content = 'both' +pygments_style = 'default' +pygments_dark_style = 'monokai' +# -- Theme ------------------------------------------------------------------- +html_theme = 'furo' +html_title = 'chython' html_logo = 'logo256.png' html_favicon = 'logo256.png' -html_theme_options = {'github_user': 'chython', 'github_repo': 'chython', 'show_related': True} -html_show_copyright = True html_show_sourcelink = False -html_sidebars = { - '**': [ - 'about.html', - 'navigation.html', - 'relations.html', # needs 'show_related': True theme option to display - 'searchbox.html', - ] -} +html_show_copyright = True -nbsphinx_execute = 'never' +html_theme_options = { + 'sidebar_hide_name': True, + 'navigation_with_keys': True, + 'source_repository': 'https://github.com/chython/chython', + 'source_branch': 'master', + 'source_directory': 'doc/', + 'light_css_variables': { + 'color-brand-primary': '#2962ff', + 'color-brand-content': '#2962ff', + }, + 'dark_css_variables': { + 'color-brand-primary': '#82b1ff', + 'color-brand-content': '#82b1ff', + }, +} diff --git a/doc/config.rst b/doc/config.rst new file mode 100644 index 00000000..f5e5d4db --- /dev/null +++ b/doc/config.rst @@ -0,0 +1,80 @@ +Configuration & Integrations +============================ + +Global settings, RDKit interoperability, 3D conformers, and pandas support. 
+ + +Configuration Reference +----------------------- + +.. code-block:: python + + import chython + + # 2D layout engine + chython.clean2d_engine = 'smilesdrawer' # default + # Options: 'rdkit', 'smilesdrawer', 'cdk', 'obabel', 'indigo' + + # 3D conformer engine + chython.conformer_engine = 'rdkit' # default + # Options: 'rdkit', 'cdpkit' + + # Neural AAM device (set before first reset_mapping call) + chython.torch_device = 'cpu' # default; 'cuda:0' for GPU + + # Java JAR paths (CDK, OPSIN) + chython.class_paths = ['/path/to/cdk.jar', '/path/to/opsin.jar'] + # Or via environment variables: CDK_PATH, OPSIN_PATH + + +RDKit Interoperability +----------------------- + +.. code-block:: python + + from chython import smiles, MoleculeContainer + + mol = smiles('c1ccccc1') + + # Convert to RDKit Mol + rdkit_mol = mol.to_rdkit() + + # Convert from RDKit Mol + mol_back = MoleculeContainer.from_rdkit(rdkit_mol) + + +3D Conformers +------------- + +.. code-block:: python + + import chython + chython.conformer_engine = 'rdkit' # or 'cdpkit' + + mol = smiles('CCO') + + # Generate conformer (stored internally) + # Access via mol._conformers after generation + + # Write 3D coordinates to SDF + from chython import SDFWrite + with SDFWrite('output_3d.sdf') as writer: + writer.write(mol, write3d=0) # conformer index + + +Pandas Integration +------------------ + +.. code-block:: python + + import pandas as pd + from chython import smiles, patch_pandas + + # Call once to enable molecule display in DataFrames + patch_pandas() + + df = pd.DataFrame({ + 'mol': [smiles('CCO'), smiles('c1ccccc1')], + 'name': ['ethanol', 'benzene'], + }) + # Molecules display correctly in DataFrame diff --git a/doc/containers.rst b/doc/containers.rst deleted file mode 100644 index 7022ff85..00000000 --- a/doc/containers.rst +++ /dev/null @@ -1,10 +0,0 @@ -chython\.containers package -=========================== - -Data classes. - -.. 
automodule:: chython.containers - :members: - :exclude-members: CGRContainer - :undoc-members: - :inherited-members: diff --git a/doc/depiction.rst b/doc/depiction.rst new file mode 100644 index 00000000..dade5958 --- /dev/null +++ b/doc/depiction.rst @@ -0,0 +1,89 @@ +Depiction +========= + +SVG rendering, 2D coordinate generation, and display settings. + + +SVG Output +---------- + +.. code-block:: python + + from chython import smiles + + mol = smiles('c1ccccc1O') + + # Generate SVG (auto-calculates 2D coords if needed) + svg = mol.depict() + + # With explicit size + svg = mol.depict(width='10cm', height='10cm') + + # PNG (requires pyppeteer) + png = mol.depict(format='png', png_width=1000, png_heigh=1000) + + # Compressed SVG + svgz = mol.depict(format='svgz') + + # Jupyter notebook: molecules render automatically via _repr_svg_ + + # Reactions too + rxn = smiles('[CH3:1][OH:2]>>[CH3:1][NH2:3]') + svg = rxn.depict() + + +2D Coordinate Generation +------------------------- + +.. code-block:: python + + import chython + + # Set engine globally (before calling clean2d) + chython.clean2d_engine = 'smilesdrawer' # default, built-in + # Other options: 'rdkit', 'cdk', 'obabel', 'indigo' + + mol = smiles('c1ccccc1') + mol.clean2d() # generates 2D coordinates + + +Depiction Settings +------------------ + +.. code-block:: python + + from chython import depict_settings + + depict_settings( + carbon=False, # hide C labels (default) + aam=True, # show atom-atom mapping + monochrome=False, # use CPK colors + bond_color='black', + font_size=0.5, + bond_width=0.04, + ) + + # Restore defaults + depict_settings() + +After changing settings, flush cached depictions: + +.. code-block:: python + + mol.flush_cache() + svg = mol.depict() + + +Grid Depiction +-------------- + +.. 
code-block:: python + + from chython import grid_depict, smiles + + mols = [smiles('CCO'), smiles('c1ccccc1'), smiles('CC(=O)O')] + svg = grid_depict(mols, cols=3) + + # In Jupyter, wrap with ipywidgets for display + from ipywidgets import HTML + HTML(grid_depict(mols, cols=3)) diff --git a/doc/files.rst b/doc/files.rst deleted file mode 100644 index 29390728..00000000 --- a/doc/files.rst +++ /dev/null @@ -1,8 +0,0 @@ -chython\.files package -====================== - -Available file parsers and writers: - -.. automodule:: chython.files - :members: - :undoc-members: diff --git a/doc/index.rst b/doc/index.rst index fd2a52ab..d5b97cfc 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -1,34 +1,45 @@ -.. include:: ../README.rst +Chython +======= -Chython package API -=================== +Library for processing molecules and reactions in Python. -chython. **pickle_cache** = True - Store cached attributes in pickle. Effective for multiprocessing. +.. code-block:: bash -chython. **torch_device** = 'cpu' - Atom-to-Atom mapping model device in torch notation. Change before first `reset_mapping` call! + pip install chython -.. automodule:: chython - :members: smiles, inchi, xyz, mdl_mol, smarts, depict_settings - :undoc-members: - :show-inheritance: +**Key capabilities:** -Subpackages ------------ +- Parse and write SMILES, InChI, IUPAC, MDL (SDF/RDF/MOL), MRV, XYZ, PDB +- Substructure and MCS search with SMARTS support +- Standardize, canonicalize, and enumerate tautomers +- Morgan and linear fingerprints with Tanimoto similarity +- Atom-to-atom mapping (neural + rule-based) +- Template-based reaction application and deprotection +- Stereo handling (tetrahedral, allene, cis-trans) +- 2D/3D depiction with Jupyter support +- RDKit interoperability + + +Cookbook +-------- .. toctree:: - :maxdepth: 4 + :maxdepth: 2 - containers - files - reactor - utils - periodictable + io + molecule + standardize + substructure + reactions + depiction + config -Notebooks -========= -.. 
toctree:: - :caption: Tutorial - :maxdepth: 1 +Links +------ + +- `Source code <https://github.com/chython/chython>`_ +- `PyPI <https://pypi.org/project/chython/>`_ +- `Issues <https://github.com/chython/chython/issues>`_ - tutorial/notebook.ipynb +Chython is a fork of `CGRtools <https://github.com/cimm-kzn/CGRtools>`_. diff --git a/doc/io.rst b/doc/io.rst new file mode 100644 index 00000000..c3ba9c55 --- /dev/null +++ b/doc/io.rst @@ -0,0 +1,389 @@ +Input / Output +============== + +Reading and writing molecules and reactions in all supported formats. + + +String Parsers +-------------- + +SMILES +~~~~~~ + +.. code-block:: python + + from chython import smiles + + mol = smiles('CCO') # ethanol + mol = smiles('c1ccccc1') # benzene (aromatic) + mol = smiles('[Cu+2]') # copper ion + mol = smiles('C/C=C/C') # trans-2-butene (with stereo) + + # Reaction SMILES + rxn = smiles('[CH3:1][OH:2]>>[CH3:1][NH2:3]') + + +InChI +~~~~~ + +.. code-block:: python + + from chython import inchi + + mol = inchi('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3') + + +IUPAC Name +~~~~~~~~~~ + +Requires OPSIN JAR. Set path via ``OPSIN_PATH`` env variable or ``chython.class_paths[1]``. + +.. code-block:: python + + from chython import iupac + + mol = iupac('acetic acid') + mol = iupac('2-acetoxybenzoic acid') + + +MDL MOL Block +~~~~~~~~~~~~~ + +.. code-block:: python + + from chython import mdl_mol + + mol_block = """ + Mrv2211 03232310102D + + 3 2 0 0 0 0 999 V2000 + 0.0000 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 1.5400 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + 3.0800 0.0000 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 + 1 2 1 0 0 0 0 + 2 3 1 0 0 0 0 + M END + """ + mol = mdl_mol(mol_block) + + +XYZ Coordinates +~~~~~~~~~~~~~~~ + +.. code-block:: python + + from chython import xyz + + mol = xyz((('O', 0., 0., 0.), ('H', 1., 0., 0.), ('H', 0., 1., 0.))) + mol.clean2d() + + +SMARTS +~~~~~~ + +Parses SMARTS into ``QueryContainer`` for substructure matching. See :doc:`substructure` for full SMARTS syntax. + +.. 
code-block:: python + + from chython import smarts + + q = smarts('[C;r5,r6;a]-;!@[C;h0,h1]') + print(q) # canonical atom order + + +File Readers +------------ + +SDF / RDF +~~~~~~~~~ + +.. code-block:: python + + from chython import SDFRead, RDFRead + + # Iterate molecules from SDF + with SDFRead('molecules.sdf') as reader: + for mol in reader: + print(mol.name, mol.meta) + + # Iterate reactions from RDF + with RDFRead('reactions.rdf') as reader: + for rxn in reader: + print(rxn) + + # Read all at once + with SDFRead('molecules.sdf') as reader: + mols = reader.read() + + # Read a limited batch + with SDFRead('molecules.sdf') as reader: + first_100 = reader.read(amount=100) + + # Generator: get first record, then read rest + with RDFRead('reactions.rdf') as f: + first = next(f) + rest = f.read() + + # Indexed random access (Unix only) + with SDFRead('molecules.sdf', indexable=True) as reader: + reader.reset_index() + mol = reader[42] + total = len(reader) + +Pathlib supported: + +.. code-block:: python + + from pathlib import Path + + with RDFRead(Path('reactions.rdf')) as r: + rxn = next(r) + +Opened file objects supported (text mode for all formats except MRV): + +.. code-block:: python + + with open('reactions.rdf') as f, RDFRead(f) as r: + rxn = next(r) + + +MRV +~~~ + +MRV files require **binary mode**: + +.. code-block:: python + + from chython import MRVRead + + with MRVRead(open('structures.mrv', 'rb')) as reader: + for mol in reader: + print(mol) + + +Reading from Archives and Network +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Readers accept any file-like object, enabling transparent reading from compressed sources: + +.. 
code-block:: python + + # gzip + from gzip import open as gzip_open + with gzip_open('data.rdf.gz', 'rt') as f, RDFRead(f) as r: + rxn = next(r) + + # zip + from zipfile import ZipFile + from io import TextIOWrapper + with ZipFile('data.zip') as z, z.open('data.rdf') as c: + with TextIOWrapper(c) as f, RDFRead(f) as r: + rxn = next(r) + + # tar.gz + from tarfile import open as tar_open + with tar_open('data.tar.gz') as t: + c = t.extractfile('data.rdf') + with TextIOWrapper(c) as f, RDFRead(f) as r: + rxn = next(r) + + # URL via requests + from requests import get + from io import StringIO + with StringIO(get('https://example.com/data.rdf').text) as f, RDFRead(f) as r: + rxn = next(r) + + +Other Readers +~~~~~~~~~~~~~ + +All readers share the same API (iteration, ``.read()``, context manager). + +- **SDFRead** - MOL/SDF (V2000, V3000) +- **RDFRead** - RXN/RDF +- **MRVRead** - ChemAxon MRV (requires binary mode) +- **PDBRead** - PDB format (explicit hydrogens only) + +For SMILES, InChI and XYZ, use the string parsers (``smiles()``, ``inchi()``, ``xyz()``) directly. +To process files with one record per line, iterate lines manually: + +.. code-block:: python + + from chython import smiles + + with open('molecules.smi') as f: + for line in f: + mol = smiles(line) + print(mol) + + +Reader Options +~~~~~~~~~~~~~~ + +MDL readers (SDFRead, RDFRead) accept these options: + +.. code-block:: python + + with SDFRead('molecules.sdf', + ignore=True, # try to fix/skip errors (default) + remap=False, # renumber atoms from 1 + ignore_stereo=False, # discard stereochemistry + ignore_bad_isotopes=False, # reset invalid isotopes + calc_cis_trans=False, # recalculate cis/trans from 2D + ) as reader: + for mol in reader: + pass + + +File Writers +------------ + +SDF / RDF +~~~~~~~~~ + +.. 
code-block:: python + + from chython import SDFWrite, RDFWrite, ESDFWrite, ERDFWrite + + # Write molecules to SDF (V2000) + with SDFWrite('output.sdf') as writer: + writer.write(mol) + + # V3000 extended format + with ESDFWrite('output_v3000.sdf') as writer: + writer.write(mol) + + # Write reactions to RDF (V2000) + with RDFWrite('output.rdf') as writer: + writer.write(rxn) + + # V3000 reactions + with ERDFWrite('output_v3000.rdf') as writer: + writer.write(rxn) + + # Append mode + with SDFWrite('output.sdf', append=True) as writer: + writer.write(mol) + + # Write with 3D coordinates (conformer index) + with SDFWrite('output_3d.sdf') as writer: + writer.write(mol, write3d=0) + + # Ongoing writing without context manager + f = RDFWrite('output.rdf') + for rxn in data: + f.write(rxn) + f.close() + + +MRV +~~~ + +.. code-block:: python + + from chython import MRVWrite + + with MRVWrite('output.mrv') as writer: + writer.write(mol) + + +SMILES Strings +~~~~~~~~~~~~~~ + +.. code-block:: python + + from chython import smiles + + mol = smiles('CCO') + + # Canonical SMILES + s = str(mol) # or format(mol) + + # Format specifiers + format(mol, 'm') # include atom mapping numbers + format(mol, 'h') # show implicit hydrogens + format(mol, 'r') # random SMILES (non-canonical) + format(mol, 'a') # asymmetric closures + format(mol, 'A') # aromatic bonds (: notation) instead of lowercase atoms + format(mol, '!s') # without stereo + format(mol, '!x') # without CXSMILES extensions + format(mol, '!z') # without charges + format(mol, '!b') # without bond tokens + format(mol, 'mh') # combine multiple: mapping + hydrogens + format(mol, 'h!b') # implicit H, no bond tokens + + # Works with f-strings, %-formatting, .format() + print(f'{mol:A}') + print('smiles: %s' % mol) + + +Serialization +------------- + +Pickle +~~~~~~ + +Full pickle support for all containers. Faster than file formats for temporary storage: + +.. 
code-block:: python + + from pickle import loads, dumps + + data = dumps(mol) + mol = loads(data) + + # Works for reactions too + data = dumps(rxn) + rxn = loads(data) + + +Chython Binary Pack +~~~~~~~~~~~~~~~~~~~ + +Compact binary format. Stores 2D coordinates, stereo, charges, isotopes, radicals, atom numbers. +Size ~1.5-2x larger than SMILES. Parsing faster than pickle. + +.. code-block:: python + + from chython import MoleculeContainer, ReactionContainer, unpack + + # Pack to bytes (zlib compressed by default) + data = mol.pack() + data = bytes(mol) # same as pack() + + # Unpack (auto-detects molecule or reaction) + restored = unpack(data) + + # Or unpack with explicit type + restored = MoleculeContainer.unpack(data) + + # Uncompressed + data = mol.pack(compressed=False) + restored = unpack(data, compressed=False) + + # Peek at atom count without unpacking + count = MoleculeContainer.pack_len(data, compressed=False) + + # Reactions + rxn_data = rxn.pack() + rxn_restored = ReactionContainer.unpack(rxn_data) + + +Metadata +-------- + +SDF/RDF files store metadata in molecule and reaction objects: + +.. code-block:: python + + rxn = next(RDFRead('reactions.rdf')) + rxn.meta # dict of DTYPE/DATUM fields + rxn.name # reaction title from RDF + + mol = rxn.reactants[0] + mol.name # molecule title from MOL block + mol.meta # molecule metadata dict + + # Set metadata for writing + mol.name = 'Ethanol' + mol.meta['boiling_point'] = '78.37' \ No newline at end of file diff --git a/doc/molecule.rst b/doc/molecule.rst new file mode 100644 index 00000000..e62d4644 --- /dev/null +++ b/doc/molecule.rst @@ -0,0 +1,283 @@ +Molecules +========= + +Properties, atom/bond access, building, and stereochemistry. + + +Molecular Formula and Mass +-------------------------- + +.. 
code-block:: python + + from chython import smiles + + mol = smiles('CC(=O)Oc1ccccc1C(=O)O') # aspirin + + mol.brutto # {'C': 9, 'H': 8, 'O': 4} + mol.brutto_formula # 'C9H8O4' + mol.brutto_formula_html # 'C<sub>9</sub>H<sub>8</sub>O<sub>4</sub>' + mol.molecular_mass # 180.159... (average atomic masses) + mol.molecular_charge # 0 (total formal charge) + + +Drug-Likeness Descriptors +------------------------- + +.. code-block:: python + + mol = smiles('CC(=O)Oc1ccccc1C(=O)O') + + mol.hydrogen_bond_donors_count # N/O/S with H + mol.hydrogen_bond_acceptors_count # O/N/S with lone pairs + mol.rotatable_bonds_count # non-ring single bonds (excludes amide-like) + mol.carbon_count + mol.carbon_sp3_count + mol.carbon_sp3_fraction # sp3 carbons / total carbons + + +Ring Properties +--------------- + +.. code-block:: python + + mol = smiles('c1ccc2ccccc2c1') # naphthalene + + mol.sssr # list of smallest rings as tuples of atom numbers + mol.rings_count # number of SSSR rings + mol.atoms_rings # dict: atom_number -> list of ring tuples + mol.atoms_rings_sizes # dict: atom_number -> set of ring sizes + mol.aromatic_rings # tuple of aromatic ring atom tuples + + +Other Properties +---------------- + +.. code-block:: python + + mol.atoms_count # heavy atoms only + mol.bonds_count + mol.is_radical # True if any atom is a radical + + +Iterating Atoms and Bonds +-------------------------- + +.. code-block:: python + + mol = smiles('CCO') + + # Iterate atom numbers + for n in mol: + print(n) + + # Iterate (atom_number, atom_object) pairs + for n, atom in mol.atoms(): + print(n, atom.atomic_symbol, atom.atomic_number) + + # Iterate (n, m, bond) triples + for n, m, bond in mol.bonds(): + print(n, m, int(bond)) # int(bond) = bond order: 1,2,3,4(aromatic) + + # Connected components + components = mol.connected_components # list of sets of atom numbers + + +Single Atom / Bond Access +-------------------------- + +.. 
code-block:: python + + atom = mol.atom(1) # get atom by number + bond = mol.bond(1, 2) # get bond between atoms 1 and 2 + + mol.has_atom(1) # True/False + mol.has_bond(1, 2) # True/False + + +Atom Properties +--------------- + +.. code-block:: python + + atom = mol.atom(1) + + atom.atomic_symbol # 'C', 'N', 'O', etc. + atom.atomic_number # 6, 7, 8, etc. + atom.atomic_mass # average atomic mass (float) + atom.isotope # isotope number or None + atom.charge # formal charge (int) + atom.is_radical # bool + atom.implicit_hydrogens # count of implicit H (int or None) + atom.explicit_hydrogens # count of explicit H neighbors + atom.neighbors # count of non-H neighbors + atom.hybridization # 1=sp3, 2=sp2, 3=sp, 4=aromatic + atom.heteroatoms # count of non-C, non-H neighbors + atom.ring_sizes # set of ring sizes containing this atom + atom.x, atom.y # 2D coordinates + atom.xy # Vector(x, y) - supports tuple unpacking + + +Atom Neighbors / Environment +----------------------------- + +.. code-block:: python + + # Full environment: (neighbor_num, bond, neighbor_atom) + for n, bond, neighbor in mol.environment(atom_num): + print(n, int(bond), neighbor.atomic_symbol) + + # Just neighbor numbers + for n in mol.environment(atom_num, include_bond=False, include_atom=False): + print(n) + + # (neighbor_num, bond) pairs + for n, bond in mol.environment(atom_num, include_atom=False): + print(n, int(bond)) + + +Adjacency Matrix +----------------- + +.. code-block:: python + + import numpy as np + + adj = mol.adjacency_matrix() # 0/1 matrix + adj = mol.adjacency_matrix(True) # bond orders as values + + +Building Molecules +------------------ + +.. 
code-block:: python + + from chython import MoleculeContainer + + mol = MoleculeContainer() + + # Add atoms (returns atom number) + n1 = mol.add_atom('C') # from symbol + n2 = mol.add_atom('C') + n3 = mol.add_atom(8) # from atomic number (oxygen) + + # Add bonds (bond order: 1=single, 2=double, 3=triple, 4=aromatic) + mol.add_bond(n1, n2, 1) + mol.add_bond(n2, n3, 2) + + print(str(mol)) # CC=O + + # Assign specific atom numbers + mol = MoleculeContainer() + mol.add_atom('C', n=10) # atom number 10 + mol.add_atom('O', n=20) + mol.add_bond(10, 20, 1) + + # Delete atom/bond + mol.delete_bond(n2, n3) + mol.delete_atom(n3) + + # Batch modifications (defer recalculation for performance) + n4 = mol.add_atom('N', _skip_calculation=True) + mol.add_bond(n2, n4, 1, _skip_calculation=True) + mol.fix_structure() # recalculate everything once + + +Merging and Splitting +--------------------- + +.. code-block:: python + + from chython import smiles + + # Split disconnected components + anion, cation = smiles('[Cl-].[Na+]').split() + print(anion, cation) + + # Merge molecules (union) + salt = anion | cation + salt = anion.union(cation, remap=True) # fix atom number overlap + + # Extract substructure by atom numbers + toluene = smiles('Cc1ccccc1') + ring = toluene.substructure([2, 3, 4, 5, 6, 7]) + + # Substructure with neighbors (1 bond deep) + aug = toluene.augmented_substructure([2], deep=1) + + # Remap atom numbers (in-place; use copy() first to keep original) + mol_copy = toluene.copy() + mol_copy.remap({1: 10, 2: 20}) + + # Copy + mol_copy = mol.copy() + + +Stereochemistry +--------------- + +Inspecting +~~~~~~~~~~ + +.. 
code-block:: python + + mol = smiles('C/C=C/C') # trans-2-butene + + mol.stereogenic_tetrahedrons # dict: atom -> neighbors tuple + mol.stereogenic_allenes # dict: atom -> neighbors tuple + mol.stereogenic_cis_trans # dict: (n, m) bond -> substituents tuple + mol.chiral_tetrahedrons # set of atoms with assigned tetrahedral stereo + mol.chiral_cis_trans # set of bonds with assigned cis/trans stereo + + +Setting +~~~~~~~ + +.. code-block:: python + + mol = smiles('CC(O)F') + + # Add tetrahedral stereo + # env = neighbor atom numbers defining chirality order + # mark = True (counterclockwise / S) or False (clockwise / R) + mol.add_atom_stereo(n=2, env=(1, 3, 4), mark=True) + + # Add cis/trans stereo to double bond + # n, m = double bond atoms; n1, n2 = substituents + # mark = True (cis) or False (trans) + mol.add_cis_trans_stereo(n=2, m=3, n1=1, n2=4, mark=False) + + # Auto-detect cis/trans from 2D coordinates + mol.calculate_cis_trans_from_2d() + + # Wedge/hash bond indicators + mol.add_wedge(n=1, m=2, mark=1) # 1 = wedge, -1 = hash + + # Clear all stereo + mol.clean_stereo() + + # Recalculate stereo from current state + mol.fix_stereo() + + +Hashing and Comparison +----------------------- + +Molecules are hashable and comparable via canonical SMILES: + +.. code-block:: python + + mol1 = smiles('CCO') + mol2 = smiles('OCC') + + mol1 == mol2 # True (same canonical SMILES) + hash(mol1) == hash(mol2) # True + + # Use in sets and dicts + unique = {smiles('CCO'), smiles('OCC'), smiles('c1ccccc1')} + len(unique) # 2 + + # Cryptographic hash (SHA-512 based) + sig = bytes(mol1) + +**Warning**: Avoid modifying molecules (standardize, aromatize, add/remove atoms) after +placing them in sets or dicts. The hash will change and lookups will break. 
\ No newline at end of file diff --git a/doc/periodictable.rst b/doc/periodictable.rst deleted file mode 100644 index f4ddda46..00000000 --- a/doc/periodictable.rst +++ /dev/null @@ -1,42 +0,0 @@ -chython\.periodictable package -============================== - -Contains classes of all elements used in containers for `MoleculeContainer`, `Query`-prefixed for `QueryContainer`. -Also available 3 special query atom types: `AnyElement`, `AnyMetal` and `ListElement`. - -Below only carbon atom classes shown. - -.. autoclass:: chython.periodictable.C - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :special-members: __hash__, __eq__, __int__ - -.. autoclass:: chython.periodictable.QueryC - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :special-members: __hash__, __eq__, __int__ - -.. autoclass:: chython.periodictable.AnyElement - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :special-members: __hash__, __eq__, __int__ - -.. autoclass:: chython.periodictable.ListElement - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :special-members: __hash__, __eq__, __int__ - -.. autoclass:: chython.periodictable.AnyMetal - :members: - :undoc-members: - :show-inheritance: - :inherited-members: - :special-members: __hash__, __eq__, __int__ diff --git a/doc/reactions.rst b/doc/reactions.rst new file mode 100644 index 00000000..4c9bdf7c --- /dev/null +++ b/doc/reactions.rst @@ -0,0 +1,205 @@ +Reactions & Templates +===================== + +Parsing, CGR, atom-atom mapping, reaction templates, and deprotection. + + +Parsing Reactions +----------------- + +.. 
code-block:: python + + from chython import smiles + + rxn = smiles('[CH3:1][OH:2]>>[CH3:1][NH2:3]') + + # Access components + rxn.reactants # tuple of MoleculeContainers + rxn.products # tuple of MoleculeContainers + rxn.reagents # tuple of MoleculeContainers + + # Iterate all molecules + for mol in rxn.molecules(): + print(str(mol)) + + # Metadata + rxn.name = 'Amination' + rxn.meta['temperature'] = '100' + + +Reaction Signatures +------------------- + +Canonical reaction SMILES (SMIRKS) with molecules sorted in canonical order: + +.. code-block:: python + + str(rxn) # canonical reaction SMILES + format(rxn, 'm') # with atom mapping + + # Format specifiers (same as molecule, plus extras) + format(rxn, '!c') # keep original molecule order (no sorting) + format(rxn, '!C') # skip CXSMILES fragment contraction + +Reactions are hashable and comparable, same as molecules: + +.. code-block:: python + + rxn1 == rxn2 # True if same canonical signature + {rxn1, rxn2} # set deduplication + + +Reaction Standardization +------------------------ + +ReactionContainer has the same standardization methods as molecules. +They are applied to all molecules in the reaction: + +.. code-block:: python + + rxn.canonicalize() + rxn.standardize() + rxn.kekule() + rxn.thiele() + rxn.neutralize() + rxn.explicify_hydrogens() # tries to preserve atom-atom mapping + rxn.implicify_hydrogens() + +Reaction-specific methods: + +.. code-block:: python + + # Move unchanged reactants to reagents (based on atom-atom mapping) + rxn.remove_reagents(keep_reagents=True) + + # Merge ions into single multicomponent molecules + rxn.contract_ions() + +Example workflow: + +.. 
code-block:: python + + rxn = smiles('[Na+:1].[OH-:2].[CH3:7][O:5][C:4]([CH3:3])=[O:6]>>[CH3:3][C:4]([OH:8])=[O:6]') + rxn.contract_ions() # merge Na+ and OH- into NaOH + rxn.remove_reagents(keep_reagents=True) # NaOH/MeOH become reagents + + +Condensed Graph of Reaction (CGR) +---------------------------------- + +CGR overlays reactant and product graphs, showing bond changes: + +.. code-block:: python + + rxn = smiles('[CH3:1][OH:2]>>[CH3:1][NH2:3]') + + # Compose CGR + cgr = ~rxn # shorthand + cgr = rxn.compose() # same + + # From two mapped molecules + mol_r = smiles('[CH3:1][OH:2]') + mol_p = smiles('[CH3:1][NH2:3]') + cgr = mol_r.compose(mol_p) + + # Reaction center + center = cgr.center_atoms # tuple of atom numbers where bonds change + + # Extract center substructure + center_cgr = cgr.substructure(center) + aug_cgr = cgr.augmented_substructure(center, deep=1) + + # CGR supports isomorphism search + for mapping in cgr_query.get_mapping(cgr): + print(mapping) + + +Atom-Atom Mapping +----------------- + +Neural attention-based mapping (requires ``chytorch-rxnmap`` package): + +.. code-block:: python + + import chython + chython.torch_device = 'cpu' # set before first use; 'cuda:0' for GPU + + rxn = smiles('CCO.CC(=O)O>>CCOC(=O)C.O') + rxn.reset_mapping() + + # Rule-based fix for known mapping mistakes + rxn.fix_mapping() + log = rxn.fix_mapping(logging=True) + +``reset_mapping`` loads the neural model once on first call. +To use GPU, set ``chython.torch_device`` before the first call. +For multiprocessing, call ``reset_mapping`` only inside workers to avoid a single-GPU bottleneck. + + +Reactor (Multi-Reactant Templates) +------------------------------------ + +Reactor applies SMARTS-pattern transformations to one or more reactant molecules. +Atom numbers in query patterns and product templates must be mapped to each other. + +.. 
code-block:: python + + from chython import smarts, smiles, Reactor + + # Define patterns using mapped SMARTS + acid = smarts('[C:1]([O;D1:2])=[O:3]') # carboxylic acid + alco = smarts('[C:4][O;D1:5]') # alcohol + + # Product template (reuses atom mapping numbers from patterns) + ester = smarts('[C:1](=[O:3])[O:5][C:4]') + + # Create reactor (patterns and products are tuples) + reactor = Reactor((acid, alco), (ester,), + delete_atoms=True, # remove atoms not in product + one_shot=False) # apply multiple times if possible + + # Apply to reactants (pass as positional args, not a list) + acid_mol = smiles('CC(=O)O') + alco_mol = smiles('CCO') + for product in reactor(acid_mol, alco_mol): + print(str(product)) + +Use ``one_shot=True`` to apply the template only once (first match). + + +Transformer (Single Molecule) +------------------------------ + +Transformer applies a pattern replacement to a single molecule: + +.. code-block:: python + + from chython import smarts, smiles, Transformer + + pattern = smarts('[C:1]=[O:2]') + replacement = smarts('[C:1][O:2]') + + t = Transformer(pattern, replacement) + + mol = smiles('CC=O') + for result in t(mol): + print(str(result)) + + +Deprotection +------------ + +Built-in templates for removing 50+ protecting groups: + +.. code-block:: python + + from chython.reactor.deprotection import apply_all, hydroxyl_benzyl, amine_boc + + mol = smiles('...') # protected molecule + + # Remove a specific protecting group + for result in hydroxyl_benzyl(mol): + print(str(result)) + + # Remove all known protecting groups iteratively + result = apply_all(mol) diff --git a/doc/reactor.rst b/doc/reactor.rst deleted file mode 100644 index 4c3161a9..00000000 --- a/doc/reactor.rst +++ /dev/null @@ -1,7 +0,0 @@ -chython\.reactor package -======================== - -.. 
automodule:: chython.reactor - :members: - :undoc-members: - :inherited-members: diff --git a/doc/standardize.rst b/doc/standardize.rst new file mode 100644 index 00000000..a2cc36e2 --- /dev/null +++ b/doc/standardize.rst @@ -0,0 +1,179 @@ +Standardization +=============== + +Canonicalization, functional group normalization, aromaticity, tautomers, and deduplication. + + +Canonicalize +------------ + +``canonicalize()`` applies the full normalization pipeline: neutralize, standardize functional groups, +Kekule normalization, implicit hydrogen cleanup, aromatization, and charge standardization. + +.. code-block:: python + + from chython import smiles + + mol = smiles('C(=O)(O)c1ccccc1') + + mol.canonicalize() + print(str(mol)) # canonical SMILES + + # Options + mol.canonicalize( + fix_tautomers=True, # canonical tautomer form (default) + keep_kekule=False, # return Kekule instead of aromatic + logging=False, # return list of changes made + ignore=True, # skip standardization bugs + ) + + # With logging: returns list of (atoms, rule_id, description) tuples + log = mol.canonicalize(logging=True) + for atoms, rule_id, description in log: + print(f'{description} at atoms {atoms}') + + +Functional Group Standardization +--------------------------------- + +``standardize()`` normalizes functional groups (nitro, sulfoxide, etc.) +without changing aromaticity or tautomers. Over 80 rules applied: + +.. code-block:: python + + mol = smiles('c1ccccc1N(=O)=O') # nitro + mol.standardize() # normalizes to [N+]([O-])=O form + + # With logging + log = mol.standardize(logging=True) + + # Charge normalization (zwitterions) + mol.standardize_charges() + + +Neutralize +---------- + +.. code-block:: python + + mol = smiles('[NH3+]CC(=O)[O-]') # zwitterion + mol.neutralize() # removes zwitterionic charges + + +Aromaticity +----------- + +.. 
code-block:: python + + mol = smiles('c1ccccc1') + + # Convert aromatic (Thiele) to Kekule form + mol.kekule() + print(str(mol)) # C1=CC=CC=C1 + + # Convert back to aromatic form + mol.thiele() + print(str(mol)) # c1ccccc1 + + # Enumerate all Kekule structures + for kekule_form in mol.enumerate_kekule(): + print(str(kekule_form)) + + +Implicit / Explicit Hydrogens +------------------------------ + +.. code-block:: python + + mol = smiles('CCO') + + # Add explicit hydrogens + added = mol.explicify_hydrogens() # returns count of added H + mol.clean2d() # recalculate layout after adding atoms + + # Remove explicit hydrogens (make implicit) + mol.implicify_hydrogens() + + # Fix implicit hydrogen counts + mol.fix_structure() + +For aromatic rings, ``implicify_hydrogens`` works only in Kekule form. +To use ``explicify_hydrogens`` on an aromatized molecule, call ``kekule()`` first, then optionally ``thiele()`` afterward. + + +Tautomers +--------- + +.. code-block:: python + + mol = smiles('Oc1ccncc1') # 4-pyridinol + + # Enumerate tautomers + for tautomer in mol.enumerate_tautomers(limit=100): + print(str(tautomer)) + + # Include charge-shifted forms + for tautomer in mol.enumerate_charged_tautomers(limit=100): + print(str(tautomer)) + + +Valence Checking +---------------- + +.. code-block:: python + + mol = smiles('C=N=Cc1ccccc1') + + # Check for valence problems (returns list of atom numbers with issues) + errors = mol.check_valence() + print('errors:', errors) + + # Aromatic rings must be kekulized first for accurate checking + mol.canonicalize() + errors = mol.check_valence() + + +Deduplication +------------- + +Using Sets +~~~~~~~~~~ + +Molecules are hashable (based on canonical SMILES), so sets remove duplicates: + +.. code-block:: python + + mols = [smiles('CCO'), smiles('OCC'), smiles('C(O)C')] + + for m in mols: + m.canonicalize() + + unique = set(mols) # 1 molecule (all three are ethanol) + + +Using Canonical SMILES +~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: python + + seen = set() + unique = [] + for mol in mols: + mol.canonicalize() + s = str(mol) + if s not in seen: + seen.add(s) + unique.append(mol) + + +Graph Equality +~~~~~~~~~~~~~~ + +``is_equal()`` compares molecular graphs including all atom/bond properties: + +.. code-block:: python + + mol1 = smiles('CCO') + mol2 = smiles('OCC') + + mol1.is_equal(mol2) # True diff --git a/doc/substructure.rst b/doc/substructure.rst new file mode 100644 index 00000000..aaba4483 --- /dev/null +++ b/doc/substructure.rst @@ -0,0 +1,321 @@ +Substructure Search & Fingerprints +=================================== + +Isomorphism, SMARTS queries, query building, and molecular fingerprints. + + +Substructure Check +------------------ + +.. code-block:: python + + from chython import smiles + + benzene = smiles('c1ccccc1') + toluene = smiles('Cc1ccccc1') + + # Operator-based + benzene < toluene # True: benzene is substructure of toluene + benzene <= toluene # True: substructure or equal + benzene < benzene # False: not strict substructure of itself + benzene <= benzene # True: equal + + # Method-based + benzene.is_substructure(toluene) # True + benzene.is_equal(toluene) # False + + +Enumerating Matches +------------------- + +``get_mapping()`` yields all substructure mappings as dicts ``{query_atom: target_atom}``: + +.. code-block:: python + + query = smiles('CC') + target = smiles('CCC') + + # First match + mapping = next(query.get_mapping(target)) + + # All matches (automorphism_filter=True by default skips symmetric duplicates) + for mapping in query.get_mapping(target): + print(mapping) + + # All symmetry-equivalent matches + for mapping in query.get_mapping(target, automorphism_filter=False): + print(mapping) + + # Restrict search to specific atoms + for mapping in query.get_mapping(target, searching_scope=[1, 2]): + print(mapping) + + +SMARTS Queries +-------------- + +``smarts()`` returns a ``QueryContainer`` with pattern-matching semantics: + +.. 
code-block:: python + + from chython import smarts, smiles + + # Carbonyl + query = smarts('[C]=[O]') + query <= smiles('CC(=O)O') # True + + for mapping in query.get_mapping(smiles('CC(=O)O')): + print(mapping) + + # Aromatic nitrogen + smarts('[N;a]') <= smiles('c1ccncc1') # True + + # Element containment shortcut + 'N' in smiles('c1ccncc1') # True + 'Br' in smiles('c1ccncc1') # False + + +SMARTS Language +--------------- + +Chython's SMARTS differs from RDKit/OpenBabel in several ways. + +Atom Primitives +~~~~~~~~~~~~~~~ + +Standard: + +- ``#N`` - atomic number (``#6`` for carbon) +- ``D`` - degree / neighbor count (``D3``) +- ``h`` - implicit hydrogen count (``h1``) +- ``r`` - ring size membership (``r5``, ``r6``) +- ``!R`` - acyclic (not in any ring) +- ``a`` - aromatic +- ``A`` - any element (wildcard; in standard SMARTS ``A`` means aliphatic) +- Charge: ``+``, ``-``, ``+2``, ``-3`` +- Isotope: ``[14C]`` +- Stereo: ``@``, ``@@`` + +Chython extensions: + +- ``z`` - hybridization: ``z1`` = sp3, ``z2`` = sp2, ``z3`` = sp, ``z4`` = aromatic +- ``x`` - heteroatom neighbor count: ``x0`` = none, ``x2`` = two +- ``M`` - any metal (d-element) + +.. code-block:: python + + smarts('[C;z2;x0]') # sp2 carbon, no heteroatom neighbors + smarts('[O;D1;z1;x0][C;D3;x2;z2]=O') # carboxylic acid + smarts('[M]') # any metal + +NOT Supported +~~~~~~~~~~~~~ + +- Recursive SMARTS ``$(...)`` +- Valence ``v`` +- Total connectivity ``X`` +- Ring count ``R`` without size (use ``r`` with explicit sizes or ``!R``) +- High-precedence AND operator ``&`` (use ``;`` instead) + +Logical Operators +~~~~~~~~~~~~~~~~~ + +- ``;`` = AND between primitives: ``[C;D3;r6]`` +- ``,`` = OR within same primitive type: ``[r5,r6]``, ``[C,N]`` + +OR cannot mix different primitive types: ``[D1,h1]`` raises an error. + +.. 
code-block:: python + + smarts('[C;r5,r6;a]') # aromatic C in 5- or 6-membered ring + smarts('[C,N]') # carbon or nitrogen + smarts('[C;!R]') # acyclic carbon + +Bond Queries +~~~~~~~~~~~~ + +- ``-`` single, ``=`` double, ``#`` triple, ``:`` aromatic, ``~`` any +- OR: ``-,=`` (single or double) +- Negation: ``!-`` (not single), ``!:`` (not aromatic) +- Ring bonds: ``-;@`` (single and in ring), ``-;!@`` (single and not in ring) + +Ring bond modifiers (``@``, ``!@``) require an explicit bond order prefix. + +.. code-block:: python + + smarts('[C]-;!@[C]') # non-ring single bond between two carbons + smarts('[C]-;@[C]') # single bond in a ring + smarts('[C]-,=[C]') # single or double bond + +CXSMARTS Extensions +~~~~~~~~~~~~~~~~~~~ + +Radicals are specified via CXSMARTS notation. Hybridization and heteroatom count use +chython's ``z`` and ``x`` primitives instead of CXSMARTS ``atomProp``: + +.. code-block:: python + + # Radical on the first atom (CXSMARTS atom indices are 0-based) + q = smarts('[C] |^1:0|') + + # Aromatic C in 5/6-ring, non-ring single bond to radical C + # with 0-1 hydrogens — use z (hybridization) and x (heteroatoms) primitives + q = smarts('[C;r5,r6;a]-;!@[C;h0,h1;z1,z2;x0] |^1:1|') + + +Query Building API +------------------ + +The recommended way to build queries is via SMARTS strings: + +.. code-block:: python + + from chython import smarts, smiles + + # Acyclic thia/oxa carbonyl with 3 neighbors + q = smarts('[C;D3;z2;!R;h0]=[O,S]') + q < smiles('CC(=O)O') # True (acid) + q < smiles('CC(=S)C') # True (thioketone) + q < smiles('CC=O') # False (aldehyde - only 2 neighbors) + +For queries that cannot be expressed in SMARTS, build programmatically. +Add atoms first, then set query constraints on the atom objects: + +.. 
code-block:: python + + from chython import QueryContainer, smiles + from chython.containers.bonds import QueryBond + from chython.periodictable import ListElement + + q = QueryContainer('') + q.add_atom('C') + a = q.atom(1) + a.neighbors = 3 + a.hybridization = 2 + a.ring_sizes = 0 + a.implicit_hydrogens = 0 + + q.add_atom(ListElement(['O', 'S']), n=3) + q.add_bond(1, 3, 2) + + q < smiles('CC(=O)O') # True (acid) + q < smiles('CC(=S)C') # True (thioketone) + + # Ring-ring linker using QueryBond(order, in_ring) + q = QueryContainer('') + q.add_atom('C') + q.add_atom('C') + q.atom(1).ring_sizes = 6 + q.atom(1).hybridization = 4 + q.atom(2).ring_sizes = 6 + q.atom(2).hybridization = 4 + q.add_bond(1, 2, QueryBond(1, False)) # single bond, NOT in ring + + q < smiles('c1ccc(cc1)-c1ccccc1') # True (biphenyl) + q < smiles('C1CC(=O)CC1') # False + +Query atom constraint values can be single integers or tuples for OR logic: + +.. code-block:: python + + a = q.atom(1) + a.neighbors = (2, 3) # 2 OR 3 neighbors + a.hybridization = 4 # aromatic only + a.ring_sizes = (5, 6) # in 5- or 6-membered ring + a.implicit_hydrogens = 0 # no implicit H + a.heteroatoms = 0 # no heteroatom neighbors + # ring_sizes = 0 means "not in any ring" + + +Automorphism +------------ + +.. code-block:: python + + mol = smiles('c1ccccc1') + + mol.is_automorphic() # True for benzene + + for mapping in mol.get_automorphism_mapping(): + print(mapping) + + +Maximum Common Substructure (MCS) +---------------------------------- + +.. code-block:: python + + mol1 = smiles('c1ccccc1O') # phenol + mol2 = smiles('c1ccccc1N') # aniline + + for mapping in mol1.get_mcs_mapping(mol2, limit=10000): + print(mapping) # {mol1_atom: mol2_atom} + break # first = largest MCS + + +Morgan Fingerprints +------------------- + +Similar to ECFP / RDKit Morgan fingerprints: + +.. 
code-block:: python + + from chython import smiles + import numpy as np + + mol = smiles('c1ccccc1O') + + # Binary fingerprint as numpy array (shape: (1024,)) + fp = mol.morgan_fingerprint( + min_radius=1, + max_radius=4, + length=1024, + number_active_bits=2, + ) + + # Bit indices only (more memory efficient) + bits = mol.morgan_bit_set(min_radius=1, max_radius=4, length=1024) + + # Raw hashes (no folding) - useful for exact fragment matching + hashes = mol.morgan_hash_set(min_radius=1, max_radius=4) + + +Linear Fingerprints +------------------- + +Based on linear path fragments (similar to RDKit RDKFingerprint): + +.. code-block:: python + + fp = mol.linear_fingerprint( + min_radius=1, + max_radius=4, + length=1024, + number_active_bits=2, + number_bit_pairs=4, # count-sensitive bits + ) + + bits = mol.linear_bit_set(min_radius=1, max_radius=4, length=1024) + hashes = mol.linear_hash_set(min_radius=1, max_radius=4) + + +Tanimoto Similarity +------------------- + +.. code-block:: python + + import numpy as np + + mol1 = smiles('c1ccccc1O') + mol2 = smiles('c1ccccc1N') + + fp1 = mol1.morgan_fingerprint() + fp2 = mol2.morgan_fingerprint() + + # Via numpy + tanimoto = np.dot(fp1, fp2) / (fp1.sum() + fp2.sum() - np.dot(fp1, fp2)) + + # Via bit sets (faster) + bits1 = mol1.morgan_bit_set() + bits2 = mol2.morgan_bit_set() + tanimoto = len(bits1 & bits2) / len(bits1 | bits2) diff --git a/doc/tutorial/example.mrv b/doc/tutorial/example.mrv deleted file mode 100644 index 19727fa2..00000000 --- a/doc/tutorial/example.mrv +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/doc/tutorial/example.rdf b/doc/tutorial/example.rdf deleted file mode 100644 index 4cec3888..00000000 --- a/doc/tutorial/example.rdf +++ /dev/null @@ -1,200 +0,0 @@ -$RDFILE 1 -$DATM 01/15/19 16:27 -$RFMT -$RXN -reaction title - Mrv16418 011501191627 - - 3 1 -$MOL -molecule title - Mrv1641801151916272D - - 3 2 0 0 0 0 999 V2000 - -10.8132 0.1375 0.0000 C 0 0 0 0 0 0 0 0 0 3 0 0 - 
-10.0988 -0.2750 0.0000 C 0 0 0 0 0 0 0 0 0 4 0 0 - -9.3843 0.1375 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0 - 1 2 1 0 0 0 0 - 2 3 1 0 0 0 0 -M END -$MOL - - Mrv1641801151916272D - - 6 5 0 0 0 0 999 V2000 - -6.2611 0.3572 0.0000 C 0 0 0 0 0 0 0 0 0 5 0 0 - -6.6736 -0.3572 0.0000 C 0 0 0 0 0 0 0 0 0 6 0 0 - -6.6736 1.0717 0.0000 O 0 0 0 0 0 0 0 0 0 7 0 0 - -5.4361 0.3572 0.0000 O 0 0 0 0 0 0 0 0 0 8 0 0 - -6.2611 -1.0717 0.0000 O 0 0 0 0 0 0 0 0 0 9 0 0 - -7.4986 -0.3572 0.0000 O 0 0 0 0 0 0 0 0 0 10 0 0 - 1 2 1 0 0 0 0 - 1 3 2 0 0 0 0 - 1 4 1 0 0 0 0 - 2 5 2 0 0 0 0 - 2 6 1 0 0 0 0 -M END -$MOL - - Mrv1641801151916272D - - 3 2 0 0 0 0 999 V2000 - -2.3645 -0.2750 0.0000 C 0 0 0 0 0 0 0 0 0 11 0 0 - -3.0789 0.1375 0.0000 C 0 0 0 0 0 0 0 0 0 12 0 0 - -1.6500 0.1375 0.0000 O 0 0 0 0 0 0 0 0 0 2 0 0 - 1 2 1 0 0 0 0 - 1 3 1 0 0 0 0 -M END -$MOL - - Mrv1641801151916272D - - 10 9 0 0 0 0 999 V2000 - 6.0400 -0.2063 0.0000 C 0 0 0 0 0 0 0 0 0 5 0 0 - 5.3256 0.2062 0.0000 C 0 0 0 0 0 0 0 0 0 6 0 0 - 6.7545 0.2062 0.0000 O 0 0 0 0 0 0 0 0 0 2 0 0 - 4.6111 -0.2063 0.0000 O 0 0 0 0 0 0 0 0 0 1 0 0 - 5.3256 1.0313 0.0000 O 0 0 0 0 0 0 0 0 0 9 0 0 - 6.0400 -1.0313 0.0000 O 0 0 0 0 0 0 0 0 0 7 0 0 - 7.4690 -0.2063 0.0000 C 0 0 0 0 0 0 0 0 0 11 0 0 - 8.1834 0.2062 0.0000 C 0 0 0 0 0 0 0 0 0 12 0 0 - 3.8966 0.2062 0.0000 C 0 0 0 0 0 0 0 0 0 4 0 0 - 3.1821 -0.2063 0.0000 C 0 0 0 0 0 0 0 0 0 3 0 0 - 1 2 1 0 0 0 0 - 1 3 1 0 0 0 0 - 2 4 1 0 0 0 0 - 2 5 2 0 0 0 0 - 1 6 2 0 0 0 0 - 3 7 1 0 0 0 0 - 7 8 1 0 0 0 0 - 4 9 1 0 0 0 0 - 9 10 1 0 0 0 0 -M END -$DTYPE CdId -$DATUM 1872 -$DTYPE solvent -$DATUM 3 -$DTYPE temperature -$DATUM 129.5 -$DTYPE tabulated_constant -$DATUM -6.87 -$RFMT -$RXN - - Mrv15b30 011501192001 - - 2 1 -$MOL - - Mrv15b3001151920012D - - 3 2 0 0 0 0 999 V2000 - -6.3936 0.3437 0.0000 C 0 0 0 0 0 0 0 0 0 1 0 0 - -5.6791 -0.0688 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 - -4.9647 0.3437 0.0000 I 0 0 0 0 0 0 0 0 0 3 0 0 - 1 2 1 0 0 0 0 - 2 3 1 0 0 0 0 -M END -$MOL - - Mrv15b3001151920012D - - 
10 10 0 0 0 0 999 V2000 - -2.3645 0.6188 0.0000 C 0 0 0 0 0 0 0 0 0 4 0 0 - -1.6500 0.2062 0.0000 C 0 0 0 0 0 0 0 0 0 5 0 0 - -1.6500 -0.6188 0.0000 C 0 0 0 0 0 0 0 0 0 6 0 0 - -2.3645 -1.0313 0.0000 C 0 0 0 0 0 0 0 0 0 7 0 0 - -3.0789 -0.6187 0.0000 C 0 0 0 0 0 0 0 0 0 8 0 0 - -3.0789 0.2063 0.0000 C 0 0 0 0 0 0 0 0 0 9 0 0 - -2.3645 1.4438 0.0000 N 0 0 0 0 0 0 0 0 0 10 0 0 - -3.0789 1.8563 0.0000 O 0 0 0 0 0 0 0 0 0 11 0 0 - -1.6500 1.8562 0.0000 O 0 0 0 0 0 0 0 0 0 12 0 0 - -2.3645 -1.8563 0.0000 O 0 0 0 0 0 0 0 0 0 13 0 0 - 1 2 1 0 0 0 0 - 1 6 2 0 0 0 0 - 1 7 1 0 0 0 0 - 2 3 2 0 0 0 0 - 3 4 1 0 0 0 0 - 4 5 2 0 0 0 0 - 4 10 1 0 0 0 0 - 5 6 1 0 0 0 0 - 7 8 2 0 0 0 0 - 7 9 2 0 0 0 0 -M END -$MOL - - Mrv15b3001151920012D - - 12 12 0 0 0 0 999 V2000 - 3.8966 -1.2375 0.0000 O 0 0 0 0 0 0 0 0 0 13 0 0 - 3.1821 2.4750 0.0000 O 0 0 0 0 0 0 0 0 0 12 0 0 - 4.6111 2.4750 0.0000 O 0 0 0 0 0 0 0 0 0 11 0 0 - 3.8966 2.0625 0.0000 N 0 0 0 0 0 0 0 0 0 10 0 0 - 4.6111 0.8250 0.0000 C 0 0 0 0 0 0 0 0 0 9 0 0 - 4.6111 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 8 0 0 - 3.8966 -0.4125 0.0000 C 0 0 0 0 0 0 0 0 0 7 0 0 - 3.1821 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 6 0 0 - 3.1821 0.8250 0.0000 C 0 0 0 0 0 0 0 0 0 5 0 0 - 3.8966 1.2375 0.0000 C 0 0 0 0 0 0 0 0 0 4 0 0 - 3.1821 -1.6500 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 - 3.1821 -2.4750 0.0000 C 0 0 0 0 0 0 0 0 0 1 0 0 - 7 1 1 0 0 0 0 - 4 2 2 0 0 0 0 - 4 3 2 0 0 0 0 - 10 4 1 0 0 0 0 - 6 5 1 0 0 0 0 - 10 5 2 0 0 0 0 - 7 6 2 0 0 0 0 - 8 7 1 0 0 0 0 - 9 8 2 0 0 0 0 - 10 9 1 0 0 0 0 - 1 11 1 0 0 0 0 - 11 12 1 0 0 0 0 -M END -$RFMT -$RXN - - Mrv15b30 011501192008 - - 2 1 -$MOL - - Mrv15b3001151920082D - - 3 2 0 0 0 0 999 V2000 - -6.9093 0.3161 0.0000 C 0 0 0 0 0 0 0 0 0 1 0 0 - -6.1948 -0.0964 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 - -5.4804 0.3161 0.0000 I 0 0 0 0 0 0 0 0 0 3 0 0 - 1 2 1 0 0 0 0 - 2 3 1 0 0 0 0 -M END -$MOL - - Mrv15b3001151920082D - - 4 3 0 0 0 0 999 V2000 - -1.6500 -0.3572 0.0000 Na 0 0 0 0 0 0 0 0 0 0 0 0 - -2.0625 0.3572 0.0000 N 0 5 0 0 0 
0 0 0 0 4 0 0 - -2.8875 0.3572 0.0000 N 0 3 0 0 0 0 0 0 0 5 0 0 - -3.7125 0.3572 0.0000 N 0 0 0 0 0 0 0 0 0 6 0 0 - 2 1 1 0 0 0 0 - 3 2 1 0 0 0 0 - 4 3 3 0 0 0 0 -M CHG 2 2 -1 3 1 -M END -$MOL - - Mrv15b3001151920082D - - 5 4 0 0 0 0 999 V2000 - 6.0696 0.3572 0.0000 N 0 5 0 0 0 0 0 0 0 6 0 0 - 5.2446 0.3572 0.0000 N 0 0 0 0 0 0 0 0 0 5 0 0 - 4.4196 0.3572 0.0000 N 0 3 0 0 0 0 0 0 0 4 0 0 - 3.5946 0.3572 0.0000 C 0 0 0 0 0 0 0 0 0 2 0 0 - 3.1821 -0.3572 0.0000 C 0 0 0 0 0 0 0 0 0 1 0 0 - 1 2 2 0 0 0 0 - 2 3 3 0 0 0 0 - 3 4 1 0 0 0 0 - 4 5 1 0 0 0 0 -M CHG 2 1 -1 3 1 -M END diff --git a/doc/tutorial/example.rdf.gz b/doc/tutorial/example.rdf.gz deleted file mode 100644 index c431963a..00000000 Binary files a/doc/tutorial/example.rdf.gz and /dev/null differ diff --git a/doc/tutorial/example.tar.gz b/doc/tutorial/example.tar.gz deleted file mode 100644 index f1ddfee8..00000000 Binary files a/doc/tutorial/example.tar.gz and /dev/null differ diff --git a/doc/tutorial/example.zip b/doc/tutorial/example.zip deleted file mode 100644 index 87810a02..00000000 Binary files a/doc/tutorial/example.zip and /dev/null differ diff --git a/doc/tutorial/notebook.ipynb b/doc/tutorial/notebook.ipynb deleted file mode 100644 index 85ebff4a..00000000 --- a/doc/tutorial/notebook.ipynb +++ /dev/null @@ -1,1622 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# 1. Input-output operations\n", - "\n", - "*chython.files* subpackage contains file readers and writers classes.\n", - "\n", - "## 1.1. MDL RDF reader\n", - "\n", - "**RDFRead** class can be used for RDF files reading.\n", - "Instance of this class is file-like object which support **iteration**, has a method **read()** for parsing all data and **context manager**.\n", - "\n", - "### 1.1.1. 
Read file from disk" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from chython.files import * # import all available readers and writers\n", - "\n", - "with RDFRead('example.rdf') as f:\n", - " first = next(f) # get first reaction using generator\n", - " data = f.read() # read remaining reactions to list of ReactionContainers\n", - "\n", - "data = []\n", - "with RDFRead('example.rdf') as f:\n", - " for r in f: # looping is supported. Useful for large files.\n", - " data.append(r)" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### OOP-stype Pathlib supported" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from pathlib import Path\n", - "\n", - "with RDFRead(Path('example.rdf')) as r: # OOP style call\n", - " r = next(r)" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "#### opened files supported\n", - "RDF file should be opened in text mode" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "with open('example.rdf') as f, RDFRead(f) as r:\n", - " r = next(r) # OOP style application" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### 1.1.2. Transparent loading from archives and network\n", - "Readers designed transparently support any type of data sources. \n", - "\n", - "Data sources should be file-like objects." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from requests import get\n", - "from io import StringIO\n", - "\n", - "# get function return requested URL which has attribute text. \n", - "# in example this text is whole RDF stored in single string.\n", - "# RDFread does not support parsing of strings, but one can emulate files with data \n", - "# instead of strings by using io.StringIO\n", - "with StringIO(get('https://github.com/chython/chython/raw/master/doc/tutorial/example.rdf').text) as f, RDFRead(f) as r:\n", - " r = next(r)\n", - "\n", - "# python support gzipped data. This example shows how to work with compressed \n", - "# data directly without decompressing them to disk.\n", - "from gzip import open as gzip_open\n", - "with gzip_open('example.rdf.gz', 'rt') as f, RDFRead(f) as r:\n", - " r = next(r)\n", - "\n", - "# zip-files also supported out of the box \n", - "# zipped files can be opened only in binary mode. io.TextIOWrapper can be used for transparent decoding them into text\n", - "from zipfile import ZipFile\n", - "from io import TextIOWrapper\n", - "with ZipFile('example.zip') as z, z.open('example.rdf') as c:\n", - " with TextIOWrapper(c) as f, RDFRead(f) as r:\n", - " r = next(r)\n", - "\n", - "# tar-file reading example\n", - "from tarfile import open as tar_open\n", - "from io import TextIOWrapper\n", - "with tar_open('example.tar.gz') as t:\n", - " c = t.extractfile('example.rdf')\n", - " with TextIOWrapper(c) as f, RDFRead(f) as r:\n", - " r = next(r)" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 1.2. Other Readers\n", - "* SDFRead - MOL, SDF files reader (versions v2000, v3000 are supported)\n", - "* MRVRead - ChemAxon MRV files reader (lxml parser is used)\n", - "* SMILESRead - SMILES strings files reader (coho backend used). 
Every row should start with new SMILES\n", - "* INCHIRead - INCHI strings files reader (INCHI trust backend used). Every row should start with new InChI\n", - "* XYZRead - xyz files reader (only structures with explicit hydrogens supported)\n", - "* PDBRead - PDB files parser (only structures with explicit hydrogens supported)\n", - "\n", - "All files except MRV should be opened in **text-mode** \n", - "MRV requires binary mode `open('/path/to/data.mrv', 'rb')`" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "with MRVRead(open('example.mrv', 'rb')) as f:\n", - " m = next(f)" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 1.3. File writers\n", - "Export in following file formats is supported:\n", - "\n", - "* RDFWrite (v2000) - molecules and reactions export in RDF format\n", - "* SDFWrite (v2000) - molecules export in SDF format\n", - "* ERDFWrite (v3000) - molecules and reactions export in RDF format\n", - "* ESDFWrite (v3000) - molecules export in SDF format\n", - "* MRVWrite - molecules and reactions export in MRV format\n", - "\n", - "Writers have the same API as readers. All writers work with text-files\n", - "Writers have `write` method which accepts as argument single reaction or molecule object" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "with RDFWrite('out.rdf') as f: # context manager supported\n", - " for r in data:\n", - " f.write(r)\n", - "# file out.rdf will be overriden" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "f = RDFWrite('out.rdf') # ongoing writing into a single file\n", - "for r in data:\n", - " f.write(r)\n", - "\n", - "f.write(r)\n", - "f.close() # close file. 
Flushes Python writer buffers." - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 1.4. Pickle support\n", - "\n", - "Chython containers fully support pickle dumping and loading.\n", - "\n", - "Pickle dumps are more fast than common files and could be used as temporal storage." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from pickle import loads, dumps" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "loads(dumps(r)) # load reaction from Pickle dump" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 1.5. Chython binary format (chython pack)\n", - "\n", - "Chython introduce new effective format for molecules and reactions, which combine benefits from MDL and SMILES formats.\n", - "Molecules store 2d-coordinates; tetrahedron, allene and cis-trans stereo; explicit bonds, implicit hydrogen count, atom numbers, radical mark, charge, isotope.\n", - "\n", - "Size only 1.5-2 times larger than SMILES. Parsing speed is faster than pickle.\n", - "Full specification described in source code." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from chython import MoleculeContainer, ReactionContainer\n", - "\n", - "b = r.pack()\n", - "r = ReactionContainer.unpack(b)\n", - "\n", - "# same for molecules\n", - "# MoleculeContainer.unpack(m.pack())" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 1.6. 
Metadata access\n", - "\n", - "RDF, SDF, etc - files have metadata which stored in molecules and reactions objects" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r = next(RDFRead('example.rdf'))\n", - "r.meta # dictionary for molecule/reaction properties storage. For example, DTYPE/DATUM fields of RDF file are read into this dictionary" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r.name # string with reaction title from RDF" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r.reactants[0].name # string with reactant molecule title from MOL" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 1.7. Depiction into SVG" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true - }, - "source": [ - "r.depict()[:100] # show only part of string" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r # Notebooks supported!" 
- ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "source": [ - "from chython import depict_settings\n", - "\n", - "depict_settings(aam=False) # configure depiction" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r.flush_cache() # drop cached depiction\n", - "r" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "depict_settings() # restore defaults" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 1.8. String parsers" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from chython import smiles, smarts, mdl_mol, xyz, inchi" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = smiles('CCO')\n", - "m.clean2d()\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = inchi('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3')\n", - "m.clean2d()\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = mdl_mol('''\n", - " Mrv2115 04202210182D \n", - "\n", - " 3 2 0 0 0 0 999 V2000\n", - " 1.2375 -0.7145 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n", - " 1.9520 -1.1270 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n", - " 2.6664 -0.7145 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0\n", - " 1 2 1 0 0 0 0\n", - " 2 3 1 0 0 0 0\n", - "M END''')\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - 
}, - "source": [ - "m = xyz((('O', 0., 0., 0.), ('H', 1., 0., 0.), ('H', 0., 1., 0.)))\n", - "m.clean2d()\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### 1.8.1. SMARTS\n", - "\n", - "Only limited features list supported.\n", - "\n", - "* stereo ignored.\n", - "* only D, a, h, r and !R atom primitives supported.\n", - "* bond order list and not bond supported.\n", - "* [not]ring bond supported only in combination with explicit bonds, not bonds and bonds orders lists.\n", - "* mapping, charge and isotopes supported.\n", - "* list of elements supported.\n", - "* A - treats as any element. A-primitive (aliphatic) ignored.\n", - "* M - treats as any metal..\n", - "* &-logic operator unsupported.\n", - "* ;-logic operator is mandatory except for charge, isotope, stereo marks. however preferable.\n", - "* CXSMARTS radicals supported.\n", - "* hybridization and heteroatoms count in CXSMARTS atomProp notation as and keys supported.\n", - "\n", - "For example::\n", - "\n", - "`[C;r5,r6;a]-;!@[C;h0,h1] |^1:1,atomProp:1.hyb.32:1.het.0|` - aromatic C member of 5 or 6 atoms ring connected with non-ring single bond to SP3 or SP2 radical C with 0 or 1 hydrogen and no heteroatom neighbors." - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q = smarts('[C;r5,r6;a]-;!@[C;h0,h1] |^1:1,atomProp:1.hyb.32:1.het.0|')\n", - "print(q) # canonic atoms order!" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# 2. Signatures and duplicates selection\n", - "\n", - "## 2.1. Molecule Signatures\n", - "*MoleculeContainer* has methods for unique molecule signature generation.\n", - "Signature is SMILES string with canonical atoms ordering. 
Order of atoms calculated by Morgan-like algorithm.\n", - "\n", - "For signature generation one need to call `str` function on MoleculeContainer object.\n", - "Fixed length hash of signature could be retrieved by calling `bytes` function on molecule (correspond to SHA 512 bitstring).\n", - "\n", - "Next string formatting keys supported:\n", - "\n", - "a - Generate asymmetric closures.\n", - "!s - Disable stereo marks.\n", - "A - Use aromatic bonds instead aromatic atoms.\n", - "m - Set atom mapping.\n", - "r - Generate random-ordered smiles.\n", - "h - Show implicit hydrogens.\n", - "!b - Disable bonds tokens." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from chython import smiles # smiles string parser\n", - "\n", - "m = smiles('c1ccccc1C=2C=CC=CC=2[C@H](O)C')\n", - "str(m) # signature\n", - "bytes(m) # cryptographic signature hash\n", - "hash(m) # runtime-dependent signature hash. See Python str hash behavior\n", - "\n", - "print(m)\n", - "print(f'f string {m}') # use signature in string formatting\n", - "print('C-style string %s' % m)\n", - "print('format method {}'.format(m))\n", - "print(f'{m:A}')\n", - "print(f'{m:h!b}') # combination supported" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Molecules comparable and hashable\n", - "\n", - "Comparison of MoleculeContainer is based on its signatures. Moreover, since strings in Python are hashable, MoleculeContaier also hashable.\n", - "\n", - "NOTE: MoleculeContainer can be changed. This can lead to unobvious behavior of the sets and dictionaries in which these molecules were placed before the change. Avoid changing molecules (standardize, aromatize, hydrogens and atoms/bonds changes) placed inside sets and dictionaries." 
- ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m != smiles('c1ccccc1')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "# Simplest way to exclude duplicated structures\n", - "len({m, m, smiles('c1ccccc1')}) == 2 # create set of unique molecules. Only 2 of them were different." - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 2.2. Reaction signatures\n", - "ReactionContainer have its signature. Signature is SMIRKS string in which molecules of reactants, reagents, products presented in canonical order.\n", - "\n", - "API is the same as for molecules\n", - "\n", - "Next extra formatting keys supported:\n", - "\n", - "!c - Keep nested containers order\n", - "!C - skip cxsmiles fragments contract" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "print(r)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "format(r, '!c')" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# 3. Structure standardization\n", - "\n", - "## 3.1. 
Molecules\n", - "\n", - "MoleculeContainer has `standardize`, `kekule`, `thiele`, `neutralize`, `implicify_hydrogens`, `explicify_hidrogens` and `canonicalize` methods.\n", - "\n", - "Method `thiele` transforms Kekule representation of rings into aromatized.\n", - "Method `standardize` applies functional group standardization rules to molecules (more than 80 rules).\n", - "\n", - "Method `canonicalize` apply set of methods: `neutralize`, `standardize`, `kekule`, `implicify_hydrogens`, `thiele`" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = smiles('c1ccccc1N(=O)=O')\n", - "m.clean2d() # calculate 2d layout\n", - "m # depict" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m.kekule() # transform to kekule form\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m.standardize() # fix groups\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m.thiele() # transform to aromatized form\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = smiles('[NH3+]CC(=O)[O-]')\n", - "m.clean2d()\n", - "m.neutralize() # fix zwitter-ions\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Molecules has `explicify_hydrogens` and `implicify_hydrogens` methods to handle hydrogens.\n", - "\n", - "This methods is used to add or remove hydrogens in molecule.\n", - "\n", - "Note `implicify_hydrogens` working for aromatic rings only in `kekule` form. 
`explicify_hydrogens` for `aromatized` forms required `kekule` and optionally `thiele` procedures applied before." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "print(m.explicify_hydrogens()) # return number of added hydrogens\n", - "m.clean2d()\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m.implicify_hydrogens()\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "To use GPU for AAM calculations, specify device:\n", - "\n", - " import chython\n", - " chython.torch_device = 'cuda'\n", - "\n", - "Note: `reset_mapping` loads torch neural network once. So, it is impossible to change device on the fly. Do it before first call of `reset_mapping`! To parallelize AAM with multiprocessing, call `reset_mapping` only in workers, to avoid bottleneck with single GPU model." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = smiles('C=N=Cc1ccccc1')\n", - "print('errors:', m.check_valence()) # atoms with valence problems. aromatic rings should be kekulized (canonicaqlized) to check problems\n", - "m.canonicalize()\n", - "print('errors:', m.check_valence())\n", - "m.clean2d()\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 3.2. Reactions\n", - "ReactionContainer has same methods as molecules. In this case they are applied to all molecules in reaction.\n", - "\n", - "`explicify_hydrogen` method try to keep atom-to-atom mapping.\n", - "\n", - "Reaction specific methods:\n", - "\n", - "* `remove_reagents` - move reactants to reagents. 
based on atom-to-atom mapping.\n", - "* `contract_ions` - merge ions in single multicomponent molecule.\n", - "* `reset_mapping` - perfom atom-to-atom mapping. Required chytorch-rxnmap package.\n", - "* `fix_mapping` - rule based atom-to-atom mapping fix for known mistakes." - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r = smiles('[Na+:1].[OH-:2].[CH3:7][O:5][C:4]([CH3:3])=[O:6]>>[CH3:3][C:4]([OH:8])=[O:6]') # mapping required\n", - "r.clean2d()\n", - "r" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r.contract_ions()\n", - "r" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r.remove_reagents(keep_reagents=True)\n", - "r" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r.explicify_hydrogens()\n", - "r.clean2d()\n", - "r" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "r = smiles('OC(=O)C(=C)C=C.C=CC#N>>OC(=O)C1=CCCC(C1)C#N')\n", - "r.clean2d()\n", - "r.reset_mapping()\n", - "r" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# 4. Isomorphism\n", - "\n", - "## 4.1. Molecules Isomorphism\n", - "\n", - "Chython has simple substructure/structure isomorphism API.\n", - "\n", - "Note, that atoms are matched in subgraph isomorphism only if they have same charge/multiplicity and isotope options." 
- ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "benzene = smiles('c1ccccc1')\n", - "toluene = smiles('c1ccccc1C')\n", - "# isomorphism operations\n", - "print(benzene < toluene) # benzene is substructure of toluene\n", - "print(benzene > toluene) # benzene is not superstructure of toluene\n", - "print(benzene <= toluene) # benzene is substructure/or same structure of toluene\n", - "print(benzene >= toluene) # benzene is not superstructure/or same structure of toluene\n", - "print(benzene < benzene) # benzene is not substructure of benzene. it's equal\n", - "print(benzene <= benzene)" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Mappings of substructure or structure to structure can be returned using `substructure.get_mapping(structure)` method. Method acts as generator." - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "next(benzene.get_mapping(toluene))" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "for m in benzene.get_mapping(toluene, automorphism_filter=False): # iterate over all possible substructure mappings\n", - " print(m)" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## 4.2. Queries\n", - "\n", - "Queries (QueryContainer) is special objects which additionally takes into account neighbors, hybridization, hydrogen count, ring size and heteroatom neighbors count state of atoms and bond in ring state.\n", - "\n", - "Queries can be generated from molecules by `substructure` method with as_query argument.\n", - "\n", - "Few special arguments for controlling atom state available." 
- ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = smiles('NCC(=O)O')\n", - "carboxy = m.substructure([3, 4, 5], as_query=True, skip_neighbors_marks=False, skip_hybridizations_marks=False, skip_hydrogens_marks=False, skip_rings_sizes_marks=False)\n", - "print(carboxy)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "carboxy < m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "carboxy < smiles('NCC(=O)OC') # not acid!" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "### 4.2.1. Query building API\n", - "\n", - "It is possible to build query and molecule objects in programming way" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from chython import QueryContainer, MoleculeContainer\n", - "from chython.containers.bonds import QueryBond\n", - "from chython.periodictable import ListElement" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q = QueryContainer() # create empty query\n", - "q.add_atom('C', neighbors=3, hybridization=2, heteroatoms=1, rings_sizes=0, hydrogens=0)\n", - "q.add_atom(ListElement(['O', 'S']), n=3) # oxygen or sulphur, with atom number 3\n", - "q.add_bond(1, 3, 2) # atoms enumerated from 1. 
connect first and 3rd atom by bouble bond.\n", - "print(q) # match only acyclic [tia] ketones" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q < smiles('CC(=O)O')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q < smiles('CC(=S)C')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q < smiles('CC=O')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q < smiles('C1CC(=O)CC1')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q = QueryContainer()\n", - "q.add_atom('C', rings_sizes=6, hybridization=4)\n", - "q.add_atom('C', rings_sizes=6, hybridization=4)\n", - "q.add_bond(1, 2, QueryBond(1, False)) # QueryBond(order, in_ring)\n", - "print(q) # match ring-ring linker" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q < smiles('C1Cc2ccccc2-c2ccccc12')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q < smiles('c1ccc(cc1)-c1ccccc1')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "q < smiles('C1CC(=O)CC1')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "source": [ - "q = QueryContainer()\n", - "q.add_atom('C', rings_sizes=6, hybridization=4)\n", - "q.add_atom('C', rings_sizes=6, 
hybridization=4)\n", - "q.add_bond(1, 2, QueryBond(1, False)) # QueryBond(order, in_ring)\n", - "print(q) # match ring-ring linker" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "source": [ - "q < smiles('C1Cc2ccccc2-c2ccccc12')" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "source": [ - "q < smiles('c1ccc(cc1)-c1ccccc1')" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Molecules construction API the same, except extra query attributes" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = MoleculeContainer()\n", - "m.add_atom('C')\n", - "m.add_atom('C')\n", - "m.add_atom('O')\n", - "m.add_bond(1, 2, 1)\n", - "m.add_bond(2, 3, 2)\n", - "m.clean2d()\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# 5. Reactor\n", - "\n", - "Reactor works similar to ChemAxon Reactions enumeration.\n", - "\n", - "Example here presents application of it to create esters from acids and alcoholes.\n", - "\n", - "First we need to construct carboxy group and alcohole matcher queries. Then, ether group need to be specified. \n", - "\n", - "Atom numbers in query and patch should be mapped to each other. The same atoms should have same numbers." 
- ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "acid = QueryContainer()\n", - "acid.add_atom('C')\n", - "acid.add_atom('O', neighbors=1)\n", - "acid.add_atom('O')\n", - "acid.add_bond(1, 2, 1)\n", - "acid.add_bond(1, 3, 2)\n", - "print(acid)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "alco = QueryContainer()\n", - "alco.add_atom('C', n=4, heteroatoms=1) # set atom number manually\n", - "alco.add_atom('O', 5, neighbors=1)\n", - "alco.add_bond(4, 5, 1)\n", - "print(alco)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "ether = QueryContainer()\n", - "ether.add_atom('C')\n", - "ether.add_atom('O', 3)\n", - "ether.add_atom('C')\n", - "ether.add_atom('O')\n", - "ether.add_bond(1, 3, 2)\n", - "ether.add_bond(1, 5, 1)\n", - "ether.add_bond(4, 5, 1)\n", - "print(ether)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from chython import Reactor\n", - "from chython.utils import grid_depict\n", - "from ipywidgets import HTML" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "rxn = Reactor([acid, alco], [ether], delete_atoms=True, one_shot=False)\n", - "# delete atoms not presented in product query\n", - "# do multiple reactions if possible" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "alcohols = [smiles('CO'), smiles('CCO'), smiles('CC(C)O')]\n", - "acids = [smiles('C(=O)O'), smiles('CC(=O)O'), smiles('OC(=O)C(=O)O')]\n", - "for x in alcohols + acids:\n", - " 
x.clean2d()" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "HTML(grid_depict(alcohols + acids))" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "from itertools import product\n", - "\n", - "products = []\n", - "for x in product(acids, alcohols):\n", - " for p in rxn(x): # apply transformation on given list of reactants\n", - " p.clean2d()\n", - " products.append(p)\n", - "len(products)" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "products[0]" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "products[-3]" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true - }, - "source": [ - "products[-4]" - ], - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# 6. 
Molecules and Reactions API\n", - "\n", - "There are explanation of some methods" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "anion, cation = smiles('[Cl-].[Na+]').split() # disconnected components can be split\n", - "print(anion, cation)\n", - "salt = anion | cation # molecules can be merged\n", - "salt = anion.union(cation, remap=True) # fix mapping overlap\n", - "salt.clean2d()\n", - "salt" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "m = toluene.substructure([1, 2, 3, 4, 5, 6]) # extraction of substructure\n", - "# set recalculate_hydrogens=False to save hydrogen count info. useful for full component extraction.\n", - "m.clean2d()\n", - "print(m.atom(1).atomic_symbol, m.atom(1).implicit_hydrogens) # aromatic structures require kekule>thiele procedure to fix hydrogens count\n", - "m.kekule() and m.thiele()\n", - "print(m.atom(1).atomic_symbol, m.atom(1).implicit_hydrogens)\n", - "m" - ], - "outputs": [] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "source": [ - "remapped = m.remap({1: 7}, copy=True) # change atom numbers\n", - "remapped" - ], - "outputs": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "python3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/doc/utils.rst b/doc/utils.rst deleted file mode 100644 index df393018..00000000 --- a/doc/utils.rst +++ /dev/null @@ -1,9 +0,0 @@ -chython\.utils package -====================== - -Utils for data 
transformation, depiction etc. - -.. automodule:: chython.utils - :members: - :undoc-members: - :inherited-members: diff --git a/pyproject.toml b/pyproject.toml index 3244f4f1..8c3ffc58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = 'chython' -version = '2.16' +version = '2.17' description = 'Library for processing molecules and reactions in python way' authors = ['Ramil Nugmanov '] license = 'LGPLv3' @@ -63,10 +63,8 @@ pytest = '>=7.4.3' optional = true [tool.poetry.group.docs.dependencies] -nbsphinx = '>=0.9.3' -pandoc = '>=2.3' -rdkit = '>=2023.9.1' -ipython = '>=8.12.1' +sphinx = '>=7.0' +furo = '>=2024.1' [build-system] requires = ['poetry-core', 'setuptools', 'cython>=3.0.5']