{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from rdkit import Chem\n",
    "from rdkit.Chem import rdMolDescriptors\n",
    "import urllib2 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "smiles = ['c1cC(=O)c(C)cN1',\n",
    "          'C(=O)(O)Cc1ccccc1C(=O)NC(Cl)N', \n",
    "          'CN(C)CC/C=C1C2=C(C=CC=C2)OCC3=C/1C=CC=C3 ', \n",
    "          'O=S(=O)(c3ccc(n1nc(cc1c2ccc(cc2)C)C(F)(F)F)cc3)N'\n",
    "         ]\n",
    "mols = [Chem.MolFromSmiles(m) for m in smiles]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "pattyFileURL = 'http://tripod.nih.gov/files/patty.rules'\n",
    "def parsePattyFile():\n",
    "    rules = []\n",
    "    patty = urllib2.urlopen(pattyFileURL)\n",
    "    for line in patty:\n",
    "        if not line:\n",
    "            continue\n",
    "        if line.startswith('#'):\n",
    "            continue\n",
    "        pattern = line.split()\n",
    "        if len(pattern) > 0:\n",
    "            rules.append([Chem.MolFromSmarts(pattern[0]),pattern[1]])\n",
    "    intMap = {n:i for i, n in enumerate(set([n for _, n in rules]))}  \n",
    "    for i in range(len(rules)):\n",
    "        rules[i][1]=intMap[rules[i][1]]\n",
    "    return rules, intMap\n",
    "\n",
    "pattyRules, pattyIntMap = parsePattyFile()\n",
    "def getPattyInvariant(mol):\n",
    "    mMatch = [0] * mol.GetNumAtoms()\n",
    "    for p, n in pattyRules:\n",
    "        match = mol.GetSubstructMatches(p)\n",
    "        if match:\n",
    "            for m in match:\n",
    "                mMatch[m[0]] = n\n",
    "    return mMatch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14 28\n",
      "52 120\n",
      "33 210\n",
      "62 325\n"
     ]
    }
   ],
   "source": [
    "for mol in mols:\n",
    "    desc = rdMolDescriptors.GetAtomPairFingerprint(mol, atomInvariants=getPattyInvariant(mol))\n",
    "    print len(desc.GetNonzeroElements().values()), sum(desc.GetNonzeroElements().values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
