USPTO¶
In [1]:
Copied!
%%time
from IPython.core.display import HTML
import retropaths.helper_functions as hf
from ipywidgets import interact
from retropaths.database.my_test import MyTest
from retropaths.database.reaction_smiles_entry import ReactionSmilesEntry, EntryNotValidError
# Reaction library loaded from ../data/reactions.p; later cells pass it to the MyTest drawing/running methods.
# NOTE(review): the `.p` extension suggests pickle files and `hf.pload` presumably unpickles them —
# confirm, and only load files from a trusted source.
library = hf.pload('../data/reactions.p')
# USPTO reaction table; it is displayed as a pandas DataFrame in the next cell.
df = hf.pload('../data/USPTO.p')
# Last expression of the cell: emits a script tag that pulls d3.js v3 into the page
# (protocol-relative URL; presumably needed by the drawing output of later cells — TODO confirm).
HTML('<script src="//d3js.org/d3.v3.min.js"></script>')
%%time
from IPython.core.display import HTML
import retropaths.helper_functions as hf
from ipywidgets import interact
from retropaths.database.my_test import MyTest
from retropaths.database.reaction_smiles_entry import ReactionSmilesEntry, EntryNotValidError
library = hf.pload('../data/reactions.p')
df = hf.pload('../data/USPTO.p')
HTML('')
CPU times: user 7.68 s, sys: 1.32 s, total: 9 s Wall time: 9.18 s
Out[1]:
In [2]:
Copied!
# The entire USPTO database as a pandas DataFrame.
# Displaying it shows only the first and last rows plus the total row/column count.
df
# This is the entire USPTO database in pandas dataframe
df
Out[2]:
| ReactionSmiles | PatentNumber | ParagraphNum | Year | TextMinedYield | CalculatedYield | |
|---|---|---|---|---|---|---|
| 0 | [C:1]([C:5]1[CH:10]=[CH:9][C:8]([OH:11])=[CH:7... | US20010000035A1 | 7.0 | 2001 | NaN | NaN |
| 1 | [Cl-].[Al+3].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][... | US20010000038A1 | 256.0 | 2001 | 86% | 86.9% |
| 2 | [Al+3].[Cl-].[Cl-].[Cl-].[Cl:5][CH2:6][CH2:7][... | US20010000038A1 | 259.0 | 2001 | 95% | NaN |
| 3 | [Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]... | US20010000038A1 | 285.0 | 2001 | NaN | NaN |
| 4 | [Cl:1][CH2:2][CH2:3][CH2:4][C:5]([C:7]1[CH:12]... | US20010000038A1 | 287.0 | 2001 | 100% | 100.9% |
| ... | ... | ... | ... | ... | ... | ... |
| 1939248 | [NH:1]1[C:9]2[C:4](=[CH:5][CH:6]=[C:7]([NH:10]... | US20160272617A1 | 2361 | 2016 | NaN | NaN |
| 1939249 | [OH:1][CH:2]1[CH2:5][CH:4]([NH:6][C:7]([C:9]2[... | US20160272617A1 | 2365 | 2016 | NaN | NaN |
| 1939250 | C(OC([NH:8][CH2:9][C@H:10]1[CH2:15][CH2:14][C@... | US20160272617A1 | 2373 | 2016 | NaN | NaN |
| 1939251 | [CH:1]1([NH:4][C:5]([C:7]2[CH:12]=[CH:11][C:10... | US20160272617A1 | 2377 | 2016 | NaN | NaN |
| 1939252 | [OH-].[Li+].FC(F)(F)C(O)=O.[NH2:10][CH2:11][C@... | US20160272617A1 | 2381 | 2016 | NaN | NaN |
1939253 rows × 6 columns
In [3]:
Copied!
from retropaths.database.reaction_smiles_entry import MyTestOptions
# Build the options object with its defaults and display it; the repr lists
# maximum_weight, allowed_atoms and not_allowed_atoms.
# NOTE(review): `mto` is only displayed here; no later cell in this notebook passes it on.
mto = MyTestOptions() # default options.
mto
from retropaths.database.reaction_smiles_entry import MyTestOptions
mto = MyTestOptions() # default options.
mto
Out[3]:
MyTestOptions(maximum_weight=300, allowed_atoms=['C', 'O', 'H', 'S', 'N', 'Na', 'F', 'Cl', 'Br', 'I', 'P'], not_allowed_atoms=['Ga', 'Zc', 'V', 'Ce', 'Ir', 'Te', 'Re', 'Tr', 'Tl', 'In', 'Mo', 'W', 'Be', 'As', 'Ba', 'Bi', 'Au', 'Sc', 'B', 'Mn', 'Cr', 'Si', 'Sn', 'Ag', 'Ni', 'Cu', 'Pb', 'Ru', 'Se', 'Rh', 'Hg', 'Sm', 'Os', 'Fe', 'Pd', 'Pt', 'Al', 'Zn', 'Ti', 'Co', 'Yb', 'Mg', 'Cs', 'Zr', 'Eu', 'Sb', 'La', 'Ar', 'Cd', 'Dy', 'Y', 'Ta', 'Sr', 'Rb', 'Er'])
In [4]:
Copied!
# Look at a single row by position; 229917 is also one of the hard-coded
# candidate indices in the entry-picking cell further down.
df.iloc[229917]
df.iloc[229917]
Out[4]:
ReactionSmiles [OH:1][C:2]1[CH:3]=[C:4]([NH:8][C:9]2[N:14]=[C... PatentNumber US20050038243A1 ParagraphNum 0700 Year 2005 TextMinedYield NaN CalculatedYield NaN Name: 229917, dtype: object
In [7]:
Copied!
# Pick one USPTO entry and try to build a ReactionSmilesEntry from it.
# Creating the object triggers all the entry validations; an invalid entry
# raises EntryNotValidError, which is reported here instead of propagating.
from retropaths.database.my_test import MyTest
from retropaths.database.reaction_smiles_entry import ReactionSmilesEntry, EntryNotValidError
import random

# Fixed index for a reproducible run (other rows inspected before: 3298, 1913164).
n = 229917
# Comment out the next line to keep the fixed index above instead of a random row.
# randrange(len(df)) draws from 0 .. len(df) - 1, i.e. exactly the valid positions
# for df.iloc; randint(0, len(df)) would also return len(df) and raise IndexError.
n = random.randrange(len(df))
print(f'{n=}')

row = df.iloc[n]
string = row['ReactionSmiles']
print(f'Reaction string:\n{string}\n')

try:
    rse = ReactionSmilesEntry.from_dataframe_row(row)  # one can use a custom MyTestOptions here.
    print(rse)
    print('\nEntry ACCEPTED')
except EntryNotValidError as e:
    print(ReactionSmilesEntry.print_from_string(string))
    print(f'\nEntry not valid -> {e}')
    # The following cell checks `rse is None` to decide whether to continue.
    rse = None

# Last expression of the cell: draw the entry straight from the raw reaction string.
ReactionSmilesEntry.draw_entry_from_string(string)
# Pick one USPTO entry and try to build a ReactionSmilesEntry from it.
# Creating the object triggers all the entry validations; an invalid entry
# raises EntryNotValidError, which is reported here instead of propagating.
from retropaths.database.my_test import MyTest
from retropaths.database.reaction_smiles_entry import ReactionSmilesEntry, EntryNotValidError
import random

# Fixed index for a reproducible run (other rows inspected before: 3298, 1913164).
n = 229917
# Comment out the next line to keep the fixed index above instead of a random row.
# randrange(len(df)) draws from 0 .. len(df) - 1, i.e. exactly the valid positions
# for df.iloc; randint(0, len(df)) would also return len(df) and raise IndexError.
n = random.randrange(len(df))
print(f'{n=}')

row = df.iloc[n]
string = row['ReactionSmiles']
print(f'Reaction string:\n{string}\n')

try:
    rse = ReactionSmilesEntry.from_dataframe_row(row)  # one can use a custom MyTestOptions here.
    print(rse)
    print('\nEntry ACCEPTED')
except EntryNotValidError as e:
    print(ReactionSmilesEntry.print_from_string(string))
    print(f'\nEntry not valid -> {e}')
    # The following cell checks `rse is None` to decide whether to continue.
    rse = None

# Last expression of the cell: draw the entry straight from the raw reaction string.
ReactionSmilesEntry.draw_entry_from_string(string)
n=336457
Reaction string:
Cl.[CH2:2]([N:4]([CH2:30][CH3:31])[C:5]([C:7]1[CH:8]=[CH:9][CH:10]=[C:11]2[C:15]=1[NH:14][CH:13]=[C:12]2[CH2:16][C@H:17]([NH:19][CH2:20][C@@H:21]([C:23]1[CH:28]=[CH:27][CH:26]=[C:25]([Cl:29])[CH:24]=1)[OH:22])[CH3:18])=[O:6])[CH3:3].C(N(CC)CC)C.Cl[C:40](Cl)([O:42]C(=O)OC(Cl)(Cl)Cl)Cl.C(=O)([O-])O.[Na+]>O1CCCC1>[CH2:30]([N:4]([CH2:2][CH3:3])[C:5]([C:7]1[CH:8]=[CH:9][CH:10]=[C:11]2[C:15]=1[NH:14][CH:13]=[C:12]2[CH2:16][C@H:17]([N:19]1[CH2:20][C@@H:21]([C:23]2[CH:28]=[CH:27][CH:26]=[C:25]([Cl:29])[CH:24]=2)[O:22][C:40]1=[O:42])[CH3:18])=[O:6])[CH3:31] |f:0.1-4.5|
Reactants:
Cl
CCN(CC)C(=O)c1cccc2c1[nH]cc2CC(C)NCC(c3cccc(c3)Cl)O Numbered
C(N(CC)CC)C
C(=O)(OC(Cl)(Cl)Cl)OC(Cl)(Cl)Cl Numbered
C(=O)([O-])O
[Na+]
Arrow:
O1CCCC1
Products:
CCN(CC)C(=O)c1cccc2c1[nH]cc2CC(C)N3CC(OC3=O)c4cccc(c4)Cl Numbered
Entry ACCEPTED
Out[7]:
You can continue only if the entry is accepted.¶
In [8]:
Copied!
# Guard: the entry-picking cell sets `rse` to None when the entry is rejected;
# in that case stop here rather than building a test from an invalid entry.
if rse is None:
    raise hf.StopExecution('You should continue only if the test is accepted')

# The entry was ACCEPTED: create a MyTest object from the raw reaction string.
# MyTest deals with the pot creation and parses the string data into retropaths.
mt = MyTest.from_reactions_smiles(string)

# Last expression of the cell: draw the test using the loaded reaction library.
mt.draw(library=library)
if rse is None:
raise hf.StopExecution('You should continue only if the test is accepted')
# if the entry was ACCEPTED, you can create a MyTest object, which will deal with the pot creations. Look how this object now can parse the string data into retropaths
mt = MyTest.from_reactions_smiles(string)
mt.draw(library=library)
Out[8]:
USPTO test number: 0 - STATUS: EMPTY
Reactants
Products
Environment
In [9]:
Copied!
%%time
# Run a single pot without conditions filtering. This can be slow-ish:
# the recorded run below reports "did not finish in time with 90 seconds"
# and a wall time of about 1min 31s.
mt.run_pots_without_conditions(library)
%%time
# this is the command to run a single pot without conditions filtering. This can be slow-ish.
mt.run_pots_without_conditions(library)
C(=O)(OC(Cl)(Cl)Cl)OC(Cl)(Cl)Cl.CCN(CC)C(=O)c1cccc2c1[nH]cc2CC(C)NCC(c3cccc(c3)Cl)O -> no_cond_0 did not finish in time with 90 seconds CPU times: user 1min 30s, sys: 250 ms, total: 1min 31s Wall time: 1min 31s
In [10]:
Copied!
# Draw the test object together with the pots that were calculated.
mt.draw_with_pots(library=library)
# this draw the test object with the pots that calculated
mt.draw_with_pots(library=library)
Out[10]:
USPTO test number: 0 - STATUS: FOUND
Reactants
Products
Environment
Pot number 0 has FOUND the product molecule.
Pot name: no_cond_0 | Status: EMPTY
Acidity all_pH | Solvent any
Pot root
Pot environment
Pot reaction graph
Target Molecule
In [11]:
Copied!
# Show the plain repr of the MyTest object: index, reaction_smiles_entry
# (including its options), data, status, pots and result_booleans.
mt
mt
Out[11]:
MyTest(index=0, reaction_smiles_entry=ReactionSmilesEntry(react=['Cl', '[CH2:2]([N:4]([CH2:30][CH3:31])[C:5]([C:7]1[CH:8]=[CH:9][CH:10]=[C:11]2[C:15]=1[NH:14][CH:13]=[C:12]2[CH2:16][C@H:17]([NH:19][CH2:20][C@@H:21]([C:23]1[CH:28]=[CH:27][CH:26]=[C:25]([Cl:29])[CH:24]=1)[OH:22])[CH3:18])=[O:6])[CH3:3]', 'C(N(CC)CC)C', 'Cl[C:40](Cl)([O:42]C(=O)OC(Cl)(Cl)Cl)Cl', 'C(=O)([O-])O', '[Na+]'], arrow=['O1CCCC1'], prod=['[CH2:30]([N:4]([CH2:2][CH3:3])[C:5]([C:7]1[CH:8]=[CH:9][CH:10]=[C:11]2[C:15]=1[NH:14][CH:13]=[C:12]2[CH2:16][C@H:17]([N:19]1[CH2:20][C@@H:21]([C:23]2[CH:28]=[CH:27][CH:26]=[C:25]([Cl:29])[CH:24]=2)[O:22][C:40]1=[O:42])[CH3:18])=[O:6])[CH3:31]'], entry_string='Cl.[CH2:2]([N:4]([CH2:30][CH3:31])[C:5]([C:7]1[CH:8]=[CH:9][CH:10]=[C:11]2[C:15]=1[NH:14][CH:13]=[C:12]2[CH2:16][C@H:17]([NH:19][CH2:20][C@@H:21]([C:23]1[CH:28]=[CH:27][CH:26]=[C:25]([Cl:29])[CH:24]=1)[OH:22])[CH3:18])=[O:6])[CH3:3].C(N(CC)CC)C.Cl[C:40](Cl)([O:42]C(=O)OC(Cl)(Cl)Cl)Cl.C(=O)([O-])O.[Na+]>O1CCCC1>[CH2:30]([N:4]([CH2:2][CH3:3])[C:5]([C:7]1[CH:8]=[CH:9][CH:10]=[C:11]2[C:15]=1[NH:14][CH:13]=[C:12]2[CH2:16][C@H:17]([N:19]1[CH2:20][C@@H:21]([C:23]2[CH:28]=[CH:27][CH:26]=[C:25]([Cl:29])[CH:24]=2)[O:22][C:40]1=[O:42])[CH3:18])=[O:6])[CH3:31] |f:0.1-4.5|', options=MyTestOptions(maximum_weight=300, allowed_atoms=['C', 'O', 'H', 'S', 'N', 'Na', 'F', 'Cl', 'Br', 'I', 'P'], not_allowed_atoms=['Ga', 'Zc', 'V', 'Ce', 'Ir', 'Te', 'Re', 'Tr', 'Tl', 'In', 'Mo', 'W', 'Be', 'As', 'Ba', 'Bi', 'Au', 'Sc', 'B', 'Mn', 'Cr', 'Si', 'Sn', 'Ag', 'Ni', 'Cu', 'Pb', 'Ru', 'Se', 'Rh', 'Hg', 'Sm', 'Os', 'Fe', 'Pd', 'Pt', 'Al', 'Zn', 'Ti', 'Co', 'Yb', 'Mg', 'Cs', 'Zr', 'Eu', 'Sb', 'La', 'Ar', 'Cd', 'Dy', 'Y', 'Ta', 'Sr', 'Rb', 'Er'])), data={}, status=<MyTestStatus.FOUND: 2>, pots=[POT C(=O)(OC(Cl)(Cl)Cl)OC(Cl)(Cl)Cl.CCN(CC)C(=O)c1cccc2c1[nH]cc2CC(C)NCC(c3cccc(c3)Cl)O -> Acidity all_pH | Solvent any], result_booleans=[True])
In [12]:
Copied!
# Detailed view of one pot with every node written down
# (warning: this can be a huge canvas to draw).
@interact(pot=mt.pots)
def draw_pot(pot):
    """Render a single pot as HTML under a header with the test index and status.

    Args:
        pot: The pot chosen in the widget dropdown (one element of ``mt.pots``).

    Returns:
        An ``HTML`` object concatenating the header, the pot drawing and the
        drawing from the target. When ``draw_from_target`` raises ``ValueError``
        a "Target not found in this pot" notice takes the place of the latter.
    """
    header = f"<h1>USPTO test number: {mt.index} - STATUS: {mt.status.name}</h1>"
    try:
        target_section = pot.draw_from_target(string_mode=True)
    except ValueError:
        target_section = '<h3>Target not found in this pot</h3>'
    # The pot itself is drawn after the target lookup, as before.
    pot_section = pot.draw(string_mode=True)
    return HTML(header + pot_section + target_section)
# To see inside pot #n in details with every node written down (warning, this can be a huge canvas to draw)
@interact(pot=mt.pots)
def draw_pot(pot):
    html = f"<h1>USPTO test number: {mt.index} - STATUS: {mt.status.name}</h1>"
    try:
        dft = pot.draw_from_target(string_mode=True)
    except ValueError:
        dft = '<h3>Target not found in this pot</h3>'
    return HTML(html + pot.draw(string_mode=True) + dft)
interactive(children=(Dropdown(description='pot', options=(POT C(=O)(OC(Cl)(Cl)Cl)OC(Cl)(Cl)Cl.CCN(CC)C(=O)c1c…
In [ ]:
Copied!