{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup a PostgreSQL database with the RDKit cartridge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# common usage\n",
    "import pandas as pd # dataframes\n",
    "# postgresql\n",
    "import psycopg2 # query postgresql database\n",
    "# chemoinformatics\n",
    "from rdkit import rdBase\n",
    "from rdkit import Chem # main RDKit module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# establish connection to the database\n",
    "conn = psycopg2.connect(dbname=\"chembl_24_dev\",user=\"gally\")\n",
    "cur = conn.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Versions:\n",
      "\n",
      "RDKit: 2018.09.1\n",
      "PostgreSQL 10.5 on x86_64-pc-linux-gnu, compiled by x86_64-conda_cos6-linux-gnu-cc (crosstool-NG fa8859cb) 7.2.0, 64-bit\n"
     ]
    }
   ],
   "source": [
    "# print versions\n",
    "print(\"Versions:\\n\")\n",
    "print(\"RDKit: {}\".format(rdBase.rdkitVersion))\n",
    "# for postgresql we need to setup a cursor first\n",
    "cur.execute('SELECT version()')\n",
    "db_version = cur.fetchone()\n",
    "print(db_version[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I copied/pasted the below commands from https://www.rdkit.org/docs/Cartridge.html\n",
    "# chembl_24_dev is a subset from the chembl_24 posgresql database\n",
    "# I replaced rdk.mols by rdk2.mols for this example\n",
    "sql=\"\"\"\n",
    "create extension if not exists rdkit;\n",
    "create schema rdk2;\n",
    "select * into rdk2.mols from (select molregno,mol_from_ctab(molfile::cstring) m  from compound_structures) tmp where m is not null;\n",
    "create index molidx on rdk2.mols using gist(m);\n",
    "alter table rdk2.mols add primary key (molregno);\n",
    "select molregno,torsionbv_fp(m) as torsionbv,morganbv_fp(m) as mfp2,featmorganbv_fp(m) as ffp2 into rdk2.fps from rdk2.mols;\n",
    "create index fps_ttbv_idx on rdk2.fps using gist(torsionbv);\n",
    "create index fps_mfp2_idx on rdk2.fps using gist(mfp2);\n",
    "create index fps_ffp2_idx on rdk2.fps using gist(ffp2);\n",
    "alter table rdk2.fps add primary key (molregno);\n",
    "create or replace function get_mfp2_neighbors(smiles text)\n",
    "returns table(molregno bigint, m mol, similarity double precision) as\n",
    "$$\n",
    "select molregno,m,tanimoto_sml(morganbv_fp(mol_from_smiles($1::cstring)),mfp2) as similarity\n",
    "from rdk2.fps join rdk2.mols using (molregno)\n",
    "where morganbv_fp(mol_from_smiles($1::cstring))%mfp2\n",
    "order by morganbv_fp(mol_from_smiles($1::cstring))<%>mfp2;\n",
    "$$ language sql stable ;\n",
    "\"\"\"\n",
    "cur.execute(sql)\n",
    "conn.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>molregno</th>\n",
       "      <th>m</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>194848</td>\n",
       "      <td>CC(C)(C)c1cc(C(N)=O)cc(C(C)(C)C)c1O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1879935</td>\n",
       "      <td>CN(Cc1nnc2n1CCC2)S(=O)(=O)c1ccc(Br)cc1Cl</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>243213</td>\n",
       "      <td>O=C1N(Cc2ccc3ccccc3c2)[C@H]([C@@H](F)Cc2ccccc2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2069682</td>\n",
       "      <td>COc1cc(/C=C(/C(=O)N2CCC=CC2=O)c2ccc(F)cc2)cc(O...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>772447</td>\n",
       "      <td>CCS(=O)(=O)N1CCN(c2cc3c(cc2[N+](=O)[O-])n(C)c(...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>215640</td>\n",
       "      <td>CCCCn1c(=O)c2ncn(C)c2c2cc(C)ccc21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1786279</td>\n",
       "      <td>N#CC1=C(N)N(c2cccnc2)C2=C(C(=O)CCC2)C1c1cc2cc(...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>137566</td>\n",
       "      <td>CC(C)(C)c1cc(NC(=O)NC2CCCCC2)n(-c2ccccc2)n1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>813327</td>\n",
       "      <td>O=C(CCCn1c(=O)[nH]c2ccccc2c1=O)N1CCN(Cc2ccc3c(...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>729364</td>\n",
       "      <td>NC(=O)Cc1ccc(C(=O)c2ccc(Cl)cc2)s1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   molregno                                                  m\n",
       "0    194848                CC(C)(C)c1cc(C(N)=O)cc(C(C)(C)C)c1O\n",
       "1   1879935           CN(Cc1nnc2n1CCC2)S(=O)(=O)c1ccc(Br)cc1Cl\n",
       "2    243213  O=C1N(Cc2ccc3ccccc3c2)[C@H]([C@@H](F)Cc2ccccc2...\n",
       "3   2069682  COc1cc(/C=C(/C(=O)N2CCC=CC2=O)c2ccc(F)cc2)cc(O...\n",
       "4    772447  CCS(=O)(=O)N1CCN(c2cc3c(cc2[N+](=O)[O-])n(C)c(...\n",
       "5    215640                  CCCCn1c(=O)c2ncn(C)c2c2cc(C)ccc21\n",
       "6   1786279  N#CC1=C(N)N(c2cccnc2)C2=C(C(=O)CCC2)C1c1cc2cc(...\n",
       "7    137566        CC(C)(C)c1cc(NC(=O)NC2CCCCC2)n(-c2ccccc2)n1\n",
       "8    813327  O=C(CCCn1c(=O)[nH]c2ccccc2c1=O)N1CCN(Cc2ccc3c(...\n",
       "9    729364                  NC(=O)Cc1ccc(C(=O)c2ccc(Cl)cc2)s1"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cur.execute(\"SELECT * FROM rdk2.mols WHERE m@>'c1ccccc1' LIMIT 10;\")\n",
    "res = cur.fetchall()\n",
    "df = pd.DataFrame(res)\n",
    "df.columns = [ c[0] for c in cur.description ] # get column index\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "str"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(df.iloc[0]['m'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
