This is an automated email from the ASF dual-hosted git repository. leerho pushed a commit to branch frankgrimes97-feature/maven-multi-module-per-java-version in repository https://gitbox.apache.org/repos/asf/datasketches-characterization.git
commit d611c61dae10d63f6ff5b95ed5326426f4edfdfc Author: Lee Rhodes <[email protected]> AuthorDate: Mon Aug 25 15:29:19 2025 -0700 Update python files --- src/notebooks/ExpansionProfile.ipynb | 908 +++++++++++++++++++++++ src/notebooks/FilterErrorProfile.ipynb | 1242 ++++++++++++++++++++++++++++++++ src/notebooks/README.md | 29 + src/notebooks/filterSizeProfile.ipynb | 1119 ++++++++++++++++++++++++++++ src/notebooks/qf_probability_sum.py | 38 + 5 files changed, 3336 insertions(+) diff --git a/src/notebooks/ExpansionProfile.ipynb b/src/notebooks/ExpansionProfile.ipynb new file mode 100644 index 0000000..4e86dae --- /dev/null +++ b/src/notebooks/ExpansionProfile.ipynb @@ -0,0 +1,908 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# False Positive Rate, Fingerprints, and Expansion\n", + "\n", + "We test the false positive rate and its relation to fingerprint and expansion." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from typing import Tuple\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "from qf_probability_sum import qf_probability_sum" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_results(filename:str) -> Tuple[pd.DataFrame, float, int]:\n", + " \"\"\"\n", + " Takes an input file name in .txt format, strips out the extra strings and returns \n", + " the main results in a dataframe.\n", + "\n", + " The universe size is the largest power of 2 that is used to insert into the sketch and \n", + " the capacity is how far along the power of two interval used for maximum cardinality.\n", + "\n", + " Eg. 
capacity = 0.9 and lgU = 20 means that we insert 0.9 * 2^20 elements into the sketch.\n", + "\n", + " Parameters:\n", + " - filename (str): The path to the input file in .txt format.\n", + "\n", + " Returns:\n", + " - Tuple[pd.DataFrame, float, int]: A tuple containing the main results in a dataframe, \n", + " the number of queries, and the number of false positives per query.\n", + " \"\"\"\n", + " with open(filename) as f:\n", + " lines = f.readlines()[2:]\n", + " ignore_from_idx = lines.index(\"PROPERTIES:\\n\")\n", + " results = lines[:ignore_from_idx-1] # there is an empty line preceding the ignore_from_idx\n", + " results_clean = [r.strip(\"\\n\") for r in results]\n", + " results_df = pd.DataFrame(sub.split(\"\\t\") for sub in results_clean[1:])\n", + " results_df.columns = results_clean[0].split(\"\\t\")\n", + " for col in results_df.columns:\n", + " results_df[col] = pd.to_numeric(results_df[col])\n", + " num_queries = None\n", + " for line in lines:\n", + " if \"lgNumQueries\" in line:\n", + " num_queries = 2.**int(line.split(\"=\")[-1])\n", + " break\n", + " results_df[\"trialFPR\"] = results_df[\"NumFalsePositives\"] / num_queries\n", + " return results_df, int(num_queries)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
False Positive Rate Bound for a fixed fingerprint\n", + "\n", + "First, we check the FPR bound of $2^{-f}$ for a single fingerprint.\n", + "The configuration for this experiment is: $m = 2^{21}$ slots, and four points in the octave between $2^{20}$ and $2^{21}$.\n", + "For each trial, we generate a new input and query set $Q$ and count the number of false positives when testing the query set against the filter.\n", + "The query set has $|Q| = 4096$.\n", + "\n", + "The number of false positives reported is approximately a Binomial distribution with $Q$ trials and probability $p$ of reporting a false positive as the Binomial success probability.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "single_fingerprint_df, num_queries = parse_results(\"QuotientFilterExpansionProfile20240813_090828PST.txt\")\n", + "single_fprint = single_fingerprint_df[\"FPrintLen\"].values[0]\n", + "\n", + "# ignore any fingerprints that are not the same length as the first fingerprint\n", + "single_fingerprint_df = single_fingerprint_df[single_fingerprint_df[\"FPrintLen\"] == single_fprint]\n", + "single_fingerprint_df.head()\n", + "\n", + "input_values = single_fingerprint_df[\"NumInput\"].unique()\n", + "num_slots = single_fingerprint_df[\"NumSlots\"].unique()[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2097152, 4096)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_slots, num_queries" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def get_mean_entries_fpr(results_df:pd.DataFrame) -> np.ndarray:\n", + " \"\"\"\n", + " Calculate the mean number of entries and false positive rate for each combination of \"NumSlots\" and \"NumInput\" in the given DataFrame.\n", + "\n", + " Parameters:\n", + " - results_df 
(pd.DataFrame): The DataFrame containing the results.\n", + "\n", + " Returns:\n", + " - np.ndarray: An array whose columns are the number of slots, mean number of entries in filterinput points and the mean of the number of entries and trial FPR.\n", + " # first column is number of slots, second is number of entries, third is trial FPR\n", + " \"\"\"\n", + " means = []\n", + " for gg in results_df.groupby(\"NumSlots\"):\n", + " for gdf in gg[1].groupby(\"NumInput\"):\n", + " means.append([gg[0], gdf[1][\"NumEntries\"].mean(), gdf[1][\"trialFPR\"].mean(), gdf[1][\"trialFPR\"].median()])\n", + " return np.array(means)\n", + "\n", + "mean_num_entries_fpr = get_mean_entries_fpr(single_fingerprint_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[2.09715200e+06, 1.04042699e+06, 1.54996514e-02, 1.53808594e-02],\n", + " [2.09715200e+06, 1.08923099e+06, 1.62308216e-02, 1.61132812e-02],\n", + " [2.09715200e+06, 1.14030623e+06, 1.69944763e-02, 1.68457031e-02],\n", + " [2.09715200e+06, 1.19375406e+06, 1.77892447e-02, 1.78222656e-02],\n", + " [2.09715200e+06, 1.24968171e+06, 1.86212659e-02, 1.85546875e-02],\n", + " [2.09715200e+06, 1.30820458e+06, 1.94831491e-02, 1.95312500e-02],\n", + " [2.09715200e+06, 1.36943901e+06, 2.03975439e-02, 2.02636719e-02],\n", + " [2.09715200e+06, 1.43350899e+06, 2.13514566e-02, 2.12402344e-02],\n", + " [2.09715200e+06, 1.50054058e+06, 2.23572850e-02, 2.22167969e-02],\n", + " [2.09715200e+06, 1.57067088e+06, 2.34036446e-02, 2.31933594e-02],\n", + " [2.09715200e+06, 1.64403715e+06, 2.44902968e-02, 2.44140625e-02]])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_num_entries_fpr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will plot the error behaviour as the number of entries in the filter increases." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x14fbeb910>" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABIgAAAGeCAYAAAD/mzriAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZwcVbnw8V9V9T77viRkEkgICZAFwr4FBGS5IAgqKgKKbMJFQUT0khASFOGCggtGUMGFKKiAV15BkS0gYUsIJiRk3zNrJrP03lV13j8605nOdE9mJtPTPTPPN5/+ZLrqdPfpU13VVU+f8xxNKaUQQgghhBBCCCGEEKOWnu0KCCGEEEIIIYQQQojskgCREEIIIYQQQgghxCgnASIhhBBCCCGEEEKIUU4CREIIIYQQQgghhBCjnASIhBBCCCGEEEIIIUY5CRAJIYQQQgghhBBCjHISIBJCCCGEEEII [...] + "text/plain": [ + "<Figure size 1400x400 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 4))\n", + "\n", + "for i, g in enumerate(single_fingerprint_df.groupby(\"NumInput\")):\n", + " group = g[0]\n", + " group_data = g[1]\n", + " fprint_len = group_data[\"FPrintLen\"].values[0]\n", + " naive_bound = 2.**(-fprint_len)\n", + "\n", + " colour = \"C\" + str(group_data[\"FPrintLen\"].values[0] -3)\n", + " ax.scatter(group_data[\"NumEntries\"], group_data[\"NumFalsePositives\"], \n", + " color=colour, marker=\"o\", s=10, alpha=0.01)\n", + " \n", + "\n", + "# bound function -- plotted at the input values points because the absolute load is a random variable that deviates \n", + "# from a deterministic input cardinality.\n", + "ff = fprint_len*np.ones_like(input_values)\n", + "bound_fct = 2.**(-ff)\n", + "ax.plot(input_values, bound_fct*num_queries, \n", + " color=\"red\", marker=\"x\", label=r\"$2^{-f}\\cdot |\\{QuerySet\\}|$\", linestyle=\"--\")\n", + "ax.plot(input_values, (mean_num_entries_fpr[:,1]/mean_num_entries_fpr[:,0])*bound_fct*num_queries, \n", + " label=r\"$\\alpha 2^{-f} \\cdot |\\{QuerySet\\}|$\", 
color=\"black\", marker=\"x\", linestyle=\"--\")\n", + "\n", + "\n", + "# Mean operator is linear in this setting\n", + "ax.plot(mean_num_entries_fpr[:,1], mean_num_entries_fpr[:,2]*num_queries, label=r\"$Mean(FPR)\\cdot |\\{QuerySet\\}|$\")\n", + "\n", + "# Exact probability sum\n", + "exact_probs = np.array([qf_probability_sum(num_slots, n, fprint_len) for n in input_values])\n", + "ax.plot(input_values, exact_probs*num_queries, label=\"Exact Probability Sum\", color=\"magenta\", marker=\"*\", linestyle=\"--\")\n", + "\n", + "# # tidy the axes\n", + "ax.set_ylabel(\"Number of FPs\")\n", + "ax.set_xlabel(\"Number of input items: $n$\")\n", + "\n", + "slot_ticks = single_fingerprint_df[\"NumInput\"].unique().tolist()\n", + "ax.set_xticks(slot_ticks)\n", + "ax.ticklabel_format(style='plain')\n", + "ax.grid()\n", + "ax.legend(loc=\"upper center\", bbox_to_anchor=(0.5, 1.15), ncol=4)\n", + "\n", + "#fig.savefig(\"single_fp_results_count.pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now subtract the mean to get a better idea of how the variance behaves as input items increase.\n", + "We see that the variance is mildly increasing. This is to be expected since each of the independent trials represents a draw from a $Binomial(Q, p)$ distribution\n", + "(where $p$ is the probability of a false positive) whose variance is proportional to $p(1-p)$ which is increasing on $(0,\\frac12)$. Indeed we see that the empirical standard deviation closely matches that predicted from the theory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x14fbc3310>" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABI0AAAF3CAYAAAAo+bvsAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3hURdvA4d+WbHoPkEASCJ0QQu9IEwERBPQVBAvYu752bAgoip+I8loQa2wgohRBpEjvPbTQSUiA9N62nvn+WLJhKZK+2TC3Vy7JyezsszPZzTnPmaISQggkSZIkSZIkSZIkSZIk6RJqRwcgSZIkSZIkSZIkSZIk1T4yaSRJkiRJkiRJkiRJkiRdQSaNJEmSJEmSJEmSJEmSpCvIpJEkSZIkSZIkSZIkSZJ0BZk0kiRJkiRJkiRJkiRJkq4gk0aSJEmSJEmSJEmSJEnSFWTSSJIkSZIkSZIkSZIk [...] + "text/plain": [ + "<Figure size 1400x400 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 4))\n", + "\n", + "num_fp_means = []\n", + "num_fp_stds = []\n", + "for i, g in enumerate(single_fingerprint_df.groupby(\"NumInput\")):\n", + " group = g[0]\n", + " group_data = g[1]\n", + " fprint_len = group_data[\"FPrintLen\"].values[0]\n", + " load_factor = group_data[\"NumEntries\"] / group_data[\"NumSlots\"]\n", + " fps_less_mean = group_data[\"NumFalsePositives\"]- group_data[\"NumFalsePositives\"].mean()\n", + " num_fp_means.append(fps_less_mean.mean())\n", + " num_fp_stds.append(fps_less_mean.std())\n", + " \n", + " colour = \"C\" + str(group_data[\"FPrintLen\"].values[0] -3)\n", + "\n", + " ax.scatter(group_data[\"NumEntries\"], fps_less_mean, \n", + " color=colour, marker=\".\", s=10, alpha=0.01)\n", + " \n", + "\n", + "# Binomial error terms\n", + "ax.plot(input_values, np.sqrt(exact_probs*(1-exact_probs)*num_queries), label=\"Binomial Error\", color=\"blue\", marker=\"+\", linestyle=\"--\")\n", + "ax.plot(input_values, -np.sqrt(exact_probs*(1-exact_probs)*num_queries), color=\"blue\", marker=\"+\", 
linestyle=\"--\")\n", + "\n", + "# Mean and standard deviations of FPs less mean\n", + "ax.plot(input_values, num_fp_means, label=\"Empirical mean\", color=\"red\", linestyle=\":\")\n", + "ax.plot(input_values, num_fp_stds, label=\"Empirical standard deviation\", color=\"red\", marker=\"x\", linestyle=\":\")\n", + "ax.plot(input_values, -np.array(num_fp_stds), color=\"red\", marker=\"x\", linestyle=\":\")\n", + "\n", + "# # tidy the axes\n", + "ax.set_ylabel(r\"$\\# \\text{FPs} - Mean(\\# \\text{FPs})$\")\n", + "ax.set_xlabel(\"Number of input items: $n$\")\n", + "\n", + "slot_ticks = single_fingerprint_df[\"NumInput\"].unique().tolist()\n", + "ax.set_xticks(slot_ticks)\n", + "ax.ticklabel_format(style='plain')\n", + "ax.grid()\n", + "ax.legend(loc=\"upper left\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[7.86102209, 7.90683742],\n", + " [8.05846448, 8.08716986],\n", + " [8.25690964, 8.27140107],\n", + " [8.4470204 , 8.45959971],\n", + " [8.63977779, 8.65182699],\n", + " [8.83408323, 8.84815939],\n", + " [8.99173992, 9.04866053],\n", + " [9.22347582, 9.25340337],\n", + " [9.4533194 , 9.46244994],\n", + " [9.73107549, 9.67586922],\n", + " [9.98112535, 9.89372946]])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# empirical and mathematical standard deviations\n", + "np.c_[num_fp_stds, np.sqrt(exact_probs*(1-exact_probs)*num_queries)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we use the same results to estimate the false positive rate by dividing out the number of query points normalising constant." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x165f0b910>" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABIYAAAJLCAYAAACSSZ1QAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3xc1Z3//9e9d7p6sy1b7qYb01tIMCQkEAibvqngNJaUXULYJbv5LiaAk7BkU0h+WQIhCXE2gbC76SQQSKGEGopNMcXGtlwkWVaXpt97z++PkcaWVSzbGo3K+6mHHtLce2bm43M113c+c87nWMYYg4iIiIiIiIiIzDh2sQMQEREREREREZHiUGJIRERERERERGSGUmJIRERERERERGSGUmJIRERERERERGSGUmJIRERERERERGSGUmJIRERERERERGSGUmJIRERERERERGSGChQ7ABGR6cDzPLLZ [...] + "text/plain": [ + "<Figure size 1400x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 6))\n", + "for i, g in enumerate(single_fingerprint_df.groupby(\"NumInput\")):\n", + " group = g[0]\n", + " group_data = g[1]\n", + " fprint_len = group_data[\"FPrintLen\"].values[0]\n", + "\n", + " naive_bound = 2.**(-fprint_len)\n", + "\n", + " colour = \"C\" + str(group_data[\"FPrintLen\"].values[0] -3)\n", + " ax.scatter(group_data[\"NumEntries\"], group_data[\"NumFalsePositives\"]/num_queries, \n", + " color=colour, marker=\"o\", s=10, alpha=0.01)\n", + " \n", + "\n", + "# bound function\n", + "ff = fprint_len*np.ones_like(input_values)\n", + "bound_fct = 2.**(-ff)\n", + "ax.plot(input_values, bound_fct, \n", + " color=\"red\", marker=\"x\", label=r\"$2^{-f}\\cdot$\", linestyle=\"--\")\n", + "ax.plot(input_values, (mean_num_entries_fpr[:,1]/mean_num_entries_fpr[:,0])*bound_fct, \n", + " label=r\"$\\alpha 2^{-f}$\", color=\"black\", marker=\"x\", linestyle=\"--\")\n", + "\n", + "ax.plot(mean_num_entries_fpr[:,1], mean_num_entries_fpr[:,2], label=r\"$Mean(FPR)$\")\n", + "ax.plot(input_values, exact_probs, label=\"Exact 
Probability Sum\", color=\"magenta\", marker=\"*\", linestyle=\"--\")\n", + "\n", + "# tidy the axes\n", + "ax.set_ylabel(\"False Positive Rate\")\n", + "ax.set_xlabel(\"Number of Entries in Filter\")\n", + "\n", + "slot_ticks = single_fingerprint_df[\"NumInput\"].unique().tolist()\n", + "ax.set_xticks(slot_ticks)\n", + "ax.ticklabel_format(style='plain')\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=2)\n", + "ax.legend(loc=\"upper center\", bbox_to_anchor=(0.5, 1.15), ncol=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.01550, 0.01526, 0.01550, 0.01526\n", + "0.01623, 0.01597, 0.01623, 0.01597\n", + "0.01699, 0.01670, 0.01699, 0.01671\n", + "0.01779, 0.01747, 0.01779, 0.01747\n", + "0.01862, 0.01827, 0.01862, 0.01827\n", + "0.01949, 0.01911, 0.01948, 0.01910\n", + "0.02041, 0.01999, 0.02040, 0.01998\n", + "0.02136, 0.02090, 0.02135, 0.02090\n", + "0.02236, 0.02186, 0.02236, 0.02186\n", + "0.02340, 0.02286, 0.02340, 0.02286\n", + "0.02450, 0.02390, 0.02449, 0.02389\n" + ] + }, + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x165f13310>" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABKoAAAJLCAYAAADHKH4OAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXxcZb348c85Z/aZTCZLs7dp0nRfaMtSimAVCkVFlqugoIKK6L0uKFzUy/0hsoi4ItyLiqiAeEVQQa7bRQoKIpQC3Zd0SZekbZJmz2RmMut5fn8MmTZdk3YmM5N+37zyKnPm5Mw358zMec73PM/30ZRSCiGEEEIIIYQQQgghskzPdgBCCCGEEEIIIYQQQoAkqoQQQgghhBBCCCFEjpBElRBCCCGEEEIIIYTICZKoEkIIIYQQQgghhBA5QRJVQgghhBBCCCGEECInSKJKCCGEEEIIIYQQQuQESVQJ [...] 
+ "text/plain": [ + "<Figure size 1400x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 6))\n", + "\n", + "fpr_means = []\n", + "fpr_stds = []\n", + "approx_bound = []\n", + "for i, g in enumerate(single_fingerprint_df.groupby(\"NumInput\")):\n", + " group = g[0]\n", + " group_data = g[1]\n", + " fprint_len = group_data[\"FPrintLen\"].values[0]\n", + " fps_less_mean = group_data[\"trialFPR\"]- group_data[\"trialFPR\"].mean()\n", + " fpr_means.append(fps_less_mean.mean())\n", + " fpr_stds.append(fps_less_mean.std())\n", + " mean_load = (group_data[\"NumEntries\"] / group_data[\"NumSlots\"]).mean()\n", + " p = mean_load*2.**(-fprint_len)\n", + " p_emp = group_data[\"trialFPR\"].mean()\n", + " print(f\"{p:.5f}, {p*(1-p):.5f}, {p_emp:.5f}, {p_emp*(1-p_emp):.5f}\")\n", + " approx_bound.append(mean_load*2.**(-fprint_len) - group_data[\"trialFPR\"].mean())\n", + "\n", + " colour = \"C\" + str(group_data[\"FPrintLen\"].values[0] -3)\n", + " ax.scatter(group_data[\"NumEntries\"], fps_less_mean, color=colour, marker=\"o\", s=10, alpha=0.01)\n", + "\n", + "\n", + "# empirical mean and standard deviations of FPs less mean\n", + "ax.plot(input_values, fpr_means, label=\"Empirical mean\", color=\"red\", marker=\"x\", linestyle=\":\")\n", + "ax.plot(input_values, fpr_stds, label=\"Empirical standard deviation\", color=\"red\", marker=\"x\", linestyle=\":\")\n", + "ax.plot(input_values, -np.array(fpr_stds), color=\"red\", marker=\"x\", linestyle=\":\")\n", + "\n", + "# add the simple bound back in\n", + "ax.plot(input_values, approx_bound, label=r\"$\\alpha 2^{-f} - \\text{Mean}(TrialFPR)$\", color=\"black\", marker=\"v\", markersize=5., linestyle=\"--\")\n", + "\n", + "# Binomial error terms\n", + "ax.plot(input_values, np.sqrt(exact_probs*(1-exact_probs)/num_queries), label=\"Binomial Error\", color=\"blue\", marker=\"+\", linestyle=\"--\")\n", + "ax.plot(input_values, 
-np.sqrt(exact_probs*(1-exact_probs)/num_queries), color=\"blue\", marker=\"+\", linestyle=\"--\")\n", + "\n", + "# tidy the axes\n", + "ax.set_ylabel(\"Empirical FPR - Mean(Empirical FPR)\")\n", + "ax.set_xlabel(\"Number of Entries in Filter\")\n", + "\n", + "slot_ticks = single_fingerprint_df[\"NumInput\"].unique().tolist()\n", + "ax.set_xticks(slot_ticks)\n", + "ax.ticklabel_format(style='plain')\n", + "ax.grid()\n", + "#ax.set_yscale(\"log\", base=2)\n", + "ax.legend(loc=\"upper center\", bbox_to_anchor=(0.5, 1.15), ncol=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Overall, what we see from this plot is that a slightly more refined bound can be given to the user given information in the filter.\n", + "Namely, the load can be calculated and this value can be used to scale the $2^{-f}$ function. The plot above shows that the function \n", + "$\\alpha 2^{-f} - \\text{Mean}(TrialFPR)$ is centered on zero, so on average there is little difference between the two functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NumSlots NumInput NumEntries FPrintLen MedNumFPs MaxNumFPs nBadTrial MeanFPR stdFPR WorstFPR Bound \n", + "------------------------------------------------------------------------------------------\n", + "2097152 1048576 1040426.0 5 63.0 95 0 0.0155 0.0019 0.0232 0.0312 \n", + "2097152 1098167 1089232.0 5 66.0 99 0 0.0162 0.0020 0.0242 0.0312 \n", + "2097152 1150104 1140308.0 5 69.0 102 0 0.0170 0.0020 0.0249 0.0312 \n", + "2097152 1204498 1193754.0 5 73.0 105 0 0.0178 0.0021 0.0256 0.0312 \n", + "2097152 1261463 1249681.0 5 76.0 106 0 0.0186 0.0021 0.0259 0.0312 \n", + "2097152 1321123 1308203.0 5 80.0 108 0 0.0195 0.0022 0.0264 0.0312 \n", + "2097152 1383604 1369436.0 5 83.0 115 0 0.0204 0.0022 0.0281 0.0312 \n", + "2097152 1449041 1433508.0 5 87.0 119 0 0.0214 0.0023 0.0291 0.0312 \n", + "2097152 1517572 1500540.0 5 91.0 122 0 0.0224 0.0023 0.0298 0.0312 \n", + "2097152 1589344 1570671.0 5 95.0 128 0 0.0234 0.0024 0.0312 0.0312 \n", + "2097152 1664511 1644037.0 5 100.0 136 6 0.0245 0.0024 0.0332 0.0312 \n" + ] + } + ], + "source": [ + "print(f\"{'NumSlots':<10} {'NumInput':<10} {'NumEntries':<10} {'FPrintLen':<10} {'MedNumFPs':<10} {'MaxNumFPs':<10} {'nBadTrial':<10} {'MeanFPR':<10} {'stdFPR':<10} {'WorstFPR':<10} {'Bound':<10}\")\n", + "print(\"-\"*90)\n", + "\n", + "for g in single_fingerprint_df.groupby(\"NumInput\"):\n", + " \n", + " num_slots = g[1][\"NumSlots\"].values[0]\n", + " fprint_len = g[1][\"FPrintLen\"].values[0]\n", + " num_entries = g[1][\"NumEntries\"].median()\n", + " med_num_fps = g[1][\"NumFalsePositives\"].median()\n", + " max_num_fps = g[1][\"NumFalsePositives\"].max()\n", + " mean_fpr = g[1][\"trialFPR\"].mean()\n", + " std_fpr = g[1][\"trialFPR\"].std()\n", + " worst_fpr = g[1]['trialFPR'].max()\n", + " bound = 2.**(-fprint_len)\n", + " thld = np.floor(num_queries * 
bound).astype(int)\n", + " g[1][\"BadTrial\"] = g[1][\"NumFalsePositives\"] > thld\n", + " num_bad_trials = g[1][\"BadTrial\"].sum() \n", + " print(f\"{num_slots:<10} {g[0]:<10} {num_entries:<10} {fprint_len:<10} {med_num_fps:<10} {max_num_fps:<10} {num_bad_trials:<10} {mean_fpr:<10.4f} {std_fpr:<10.4f} {worst_fpr:<10.4f} {bound:<10.4f}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. False Positive Rate Bound for varying fingerprint length\n", + "We should see similar behaviour to the above but across all fingerprint lengths." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>NumSlots</th>\n", + " <th>NumInput</th>\n", + " <th>FPrintLen</th>\n", + " <th>NumEntries</th>\n", + " <th>NumFalsePositives</th>\n", + " <th>trialFPR</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>2097152</td>\n", + " <td>1048576</td>\n", + " <td>5</td>\n", + " <td>1040474</td>\n", + " <td>72</td>\n", + " <td>0.017578</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2097152</td>\n", + " <td>1098167</td>\n", + " <td>5</td>\n", + " <td>1089276</td>\n", + " <td>77</td>\n", + " <td>0.018799</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2097152</td>\n", + " <td>1150104</td>\n", + " <td>5</td>\n", + " <td>1140311</td>\n", + " <td>79</td>\n", + " <td>0.019287</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2097152</td>\n", + " <td>1204498</td>\n", + " <td>5</td>\n", + " 
<td>1193814</td>\n", + " <td>83</td>\n", + " <td>0.020264</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>2097152</td>\n", + " <td>1261463</td>\n", + " <td>5</td>\n", + " <td>1249723</td>\n", + " <td>86</td>\n", + " <td>0.020996</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " NumSlots NumInput FPrintLen NumEntries NumFalsePositives trialFPR\n", + "0 2097152 1048576 5 1040474 72 0.017578\n", + "1 2097152 1098167 5 1089276 77 0.018799\n", + "2 2097152 1150104 5 1140311 79 0.019287\n", + "3 2097152 1204498 5 1193814 83 0.020264\n", + "4 2097152 1261463 5 1249723 86 0.020996" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "multi_fingerprint_df, multi_fp_num_queries = parse_results(\"QuotientFilterExpansionProfile20240814_034519PST.txt\")\n", + "\n", + "\n", + "multi_input_values = multi_fingerprint_df[\"NumInput\"].unique()\n", + "multi_num_slots = multi_fingerprint_df[\"NumSlots\"].unique()\n", + "\n", + "multi_fingerprint_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def get_fingerprint_lengths(df:pd.DataFrame) -> np.ndarray:\n", + " \"\"\"\n", + " Get the fingerprint lengths in the given DataFrame.\n", + "\n", + " Parameters:\n", + " - df (pd.DataFrame): The DataFrame containing the fingerprint lengths.\n", + "\n", + " Returns:\n", + " - np.ndarray: An array of the fingerprint lengths.\n", + " \"\"\"\n", + " num_input_fprint_len_arr = []\n", + " for g in df.groupby(\"NumInput\"):\n", + " num_input_fprint_len_arr.append([g[0], g[1][\"FPrintLen\"].values[0]])\n", + " return np.array(num_input_fprint_len_arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "30\n" + ] + } + ], + "source": [ + "unique_triples = multi_fingerprint_df[[\"NumSlots\", 
\"NumInput\",\t\"FPrintLen\"]].drop_duplicates()\n", + "print(len(unique_triples))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "multi_exact_probs = np.zeros((len(unique_triples),), dtype=float)\n", + "for i, triple in enumerate(unique_triples.values):\n", + " multi_exact_probs[i] = qf_probability_sum(*triple)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x16467e490>" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABH4AAAGjCAYAAABaLdJPAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAD9VElEQVR4nOzdd3xV5f3A8c85dyU3e5CEBEhYIkOWgIIiuPeqbdXWn1gVxWq1WgdaBQQs7mq1ltaBtmqrddVa9wAHQ2XvHQgkkEH2uuM8vz9OcrPuDQkk54bwffu63nuec+4533u563zzPN9HU0ophBBCCCGEEEIIIUS3o4c7ACGEEEIIIYQQQgjROSTxI4QQQgghhBBCCNFNSeJHCCGEEEIIIYQQopuSxI8QQgghhBBCCCFENyWJHyGEEEIIIYQQQohuShI/QgghhBBCCCGEEN2UJH6EEEIIIYQQQgghuilJ/Agh [...] 
+ "text/plain": [ + "<Figure size 1400x400 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 4))\n", + "\n", + "multi_mean_num_entries_fpr = get_mean_entries_fpr(multi_fingerprint_df)\n", + "\n", + "for i, g in enumerate(multi_fingerprint_df.groupby(\"NumInput\")):\n", + " group = g[0]\n", + " group_data = g[1]\n", + " fprint_len = group_data[\"FPrintLen\"].values[0]\n", + " naive_bound = 2.**(-fprint_len)\n", + "\n", + " colour = \"C\" + str(group_data[\"FPrintLen\"].values[0] -3)\n", + " ax.scatter(group_data[\"NumEntries\"], group_data[\"NumFalsePositives\"], \n", + " color=colour, marker=\"o\", s=10, alpha=0.01)\n", + " \n", + "\n", + "# bound function -- plotted at the input values points because the absolute load is a random variable that deviates \n", + "# from a deterministic input cardinality.\n", + "inputs_and_fprint_lens = get_fingerprint_lengths(multi_fingerprint_df)\n", + "bound_fct = 2.**(-inputs_and_fprint_lens[:, 1])\n", + "ax.plot(inputs_and_fprint_lens[:, 0], bound_fct*multi_fp_num_queries, \n", + " color=\"red\", marker=\"x\", label=r\"$2^{-f}\\cdot |\\{QuerySet\\}|$\", linestyle=\"--\")\n", + "ax.plot(multi_input_values, (multi_mean_num_entries_fpr[:,1]/multi_mean_num_entries_fpr[:,0])*bound_fct*multi_fp_num_queries, \n", + " label=r\"$\\alpha 2^{-f} \\cdot |\\{QuerySet\\}|$\", color=\"black\", marker=\"x\", linestyle=\"--\")\n", + "\n", + "\n", + "# Mean operator is linear in this setting\n", + "ax.plot(multi_mean_num_entries_fpr[:,1], multi_mean_num_entries_fpr[:,2]*multi_fp_num_queries, label=r\"$Mean(FPR)\\cdot |\\{QuerySet\\}|$\")\n", + "\n", + "# Exact probability sum\n", + "ax.plot(multi_input_values, multi_exact_probs*num_queries, label=\"Exact Probability Sum\", color=\"magenta\", marker=\"*\", linestyle=\"--\")\n", + "\n", + "# tidy the axes\n", + "ax.set_ylabel(\"Number of FPs\")\n", + "ax.set_xlabel(\"Number of input items: $n$\")\n", + 
"ax.set_yscale(\"log\", base=2)\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.grid()\n", + "ax.legend(loc=\"upper center\", bbox_to_anchor=(0.5, 1.15), ncol=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x14ff44f10>" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABIYAAAGjCAYAAABZth4iAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAADwkklEQVR4nOzdd3xb5dXA8d+9GrblvVccxxmE7ARCIKywoYGyRyktgZZRoKWUTUtCSKCBQul8gRIKlFJGW0qhQCmhJYEwQxYJ2cOOE8fx3rbGvc/7x5VkO5ZMnNhXGefLx0j30dXVkRTL0tF5zqMppRRCCCGEEEIIIYQQ4pCjxzoAIYQQQgghhBBCCBEbkhgSQgghhBBCCCGEOERJYkgIIYQQQgghhBDiECWJISGEEEIIIYQQQohDlCSGhBBCCCGEEEIIIQ5RkhgSQgghhBBCCCGEOERJYkgIIYQQQgghhBDiECWJ [...] + "text/plain": [ + "<Figure size 1400x400 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 4))\n", + "\n", + "multi_mean_num_entries_fpr = get_mean_entries_fpr(multi_fingerprint_df)\n", + "\n", + "for i, g in enumerate(multi_fingerprint_df.groupby(\"NumInput\")):\n", + " group = g[0]\n", + " group_data = g[1]\n", + " fprint_len = group_data[\"FPrintLen\"].values[0]\n", + " naive_bound = 2.**(-fprint_len)\n", + "\n", + " colour = \"C\" + str(group_data[\"FPrintLen\"].values[0] -3)\n", + " ax.scatter(group_data[\"NumEntries\"], group_data[\"trialFPR\"], \n", + " color=colour, marker=\"o\", s=10, alpha=0.01)\n", + " \n", + "\n", + "# bound function -- plotted at the input values points because the absolute load is a random variable that deviates \n", + "# from a deterministic input cardinality.\n", + "inputs_and_fprint_lens = get_fingerprint_lengths(multi_fingerprint_df)\n", + "bound_fct = 2.**(-inputs_and_fprint_lens[:, 1])\n", + 
"ax.plot(inputs_and_fprint_lens[:, 0], bound_fct, \n", + " color=\"red\", marker=\"x\", label=r\"$2^{-f}$\", linestyle=\"--\")\n", + "ax.plot(multi_input_values, (multi_mean_num_entries_fpr[:,1]/multi_mean_num_entries_fpr[:,0])*bound_fct, \n", + " label=r\"$\\alpha 2^{-f}$\", color=\"black\", marker=\"x\", linestyle=\"--\")\n", + "\n", + "\n", + "# Mean operator is linear in this setting\n", + "ax.plot(multi_mean_num_entries_fpr[:,1], multi_mean_num_entries_fpr[:,2], label=r\"$Mean(FPR)$\")\n", + "\n", + "# Exact probability sum\n", + "ax.plot(multi_input_values, multi_exact_probs, label=\"Exact Probability Sum\", color=\"magenta\", marker=\"*\", linestyle=\"--\")\n", + "\n", + "# tidy the axes\n", + "ax.set_ylabel(\"FPR\")\n", + "ax.set_xlabel(\"Number of input items: $n$\")\n", + "ax.set_yscale(\"log\", base=2)\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.grid()\n", + "ax.legend(loc=\"upper center\", bbox_to_anchor=(0.5, 1.15), ncol=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see the same behaviour as for the single fingerprint length even when the filter expands.\n", + "Different colours represent different fingerprint lengths. The three groups are $5, 4, 3$ from left to right in green, orange, and blue." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/notebooks/FilterErrorProfile.ipynb b/src/notebooks/FilterErrorProfile.ipynb new file mode 100644 index 0000000..f359b3e --- /dev/null +++ b/src/notebooks/FilterErrorProfile.ipynb @@ -0,0 +1,1242 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "df8bb271", + "metadata": {}, + "source": [ + "# Error Profile: varying the signature length.\n", + "\n", + "### 1. Introduction\n", + "\n", + "This notebooks compares the **error** behaviour of the Bloom and Quotient Filter as the number of hash bits for each filter is varied.\n", + "Both filters are designed for _approximate set membership_ queries and have the property that, upon querying for the membership of an element:\n", + "1. Quering any item inserted into the sketch will always return `true` and **never** return `false` -- _\"no false negative property\"._\n", + "2. Some items that were not in the set used to build the filter **may** return `true` -- this is known as a _false positive_.\n", + "\n", + "The **false positive probability (fpr)** is the probability that the filter returns `true` to any query item not in the set used to build the filter. We test this by building a filter $S$ over a fixed input cardinality $n$ and then generating a query set $X$ that has empty intersection with the input set. For every item in $X$, we count the number of times that the filter returns `true` to the approximate membership query and divide this by $|X|$. Over large enough sets $X$ and mu [...] 
+ "For each filter, a signature is generated and the number of hash bits is the length of the signature.\n", + "The experiment is repeated over different signature lengths and the performance is compared to the theoretically expected curves.\n", + "\n", + "## 2. Types of Filter\n", + "\n", + "### 2.1 Bloom Filter ###\n", + "\n", + "A **Bloom Filter** is an array of $m$ bits. The array is populated using $k$ independent hash functions and the signature is the tuple of index locations in the array \n", + "$h(x) = (h_1(x), h_2(x), \\dots, h_k(x))$ with each $h_j(x) \\in \\{0, 1, 2, \\dots, m-1 \\}$.\n", + "Bloom filter accepts $n < m$ items with the specific $n$ coupled to both the length $m$ and the number of hash functions, $k$.\n", + "The Bloom filter fpr is approximately $\\epsilon = (1 - e^{-\\frac{nk}{m}})^k$ which is minimised with the choice of $k = \\ln(2) \\frac{m}{n}$.\n", + "\n", + "An optimally initialised Bloom filter has $k = \\frac{m}{n} \\ln(2)$ so that the **space** in bits is $m = \\frac{k}{\\ln(2) n}$ and the **false positive probability** is approximately:\n", + "\\begin{align}\n", + "\\epsilon &\\approx (1 - e^{\\frac{n}{m}k})^k \\\\ \n", + "&=(1 - e^{\\frac{n}{m}\\frac{m}{n} \\ln(2)})^{\\frac{m}{n} \\ln(2)} \\\\ \n", + "&= 2^{-\\frac{m}{n} \\ln(2)} \\tag{*} \\\\ \n", + "&= 2^{-k}.\n", + "\\end{align}\n", + "\n", + "### 2.2 Quotient Filter ###\n", + "\n", + "A **Quotient Filter** is an array of $k=2^Q$ slots, with each slot storing $b = 3 + f$ bits in total: $3$ bits are used for state and\n", + "a further $f$ bits are used for a fingerprint that partially identifies the item in a given slot.\n", + "We think of the fingerprint length, $f$, as being the signature length as it is this portion of a hash function output that \n", + "is related to the accuracy of a generated filter.\n", + "To generate a fingerprint, we take a hash function $h(x)$ and use $Q$ of the hash bits to for the slot selection and $f$ of them for the fingerprint \n", + 
"selection. \n", + "Any further hash bits may be discarded and the total *space** usage in bits is $m = 2^Q(3+f)$.\n", + "\n", + "The $Q$ slot bits are implicitly stored via the array index and any items that land in the same slot but have distinct fingerprints have their \n", + "collision resolved through the three metadata state bits.\n", + "The $Q$ bits used for the address space represent a value for each item that refers to its _canonical slot_ -- the location an item would be stored were \n", + "there no hash collisions.\n", + "Collisions are resolved similarly to linear probing with the combination of state and fingerprint bits being used to disambiguate items.\n", + "The Quotient Filter can, conceptually, accept up to $n = 2^Q$ items, however, this would cause complete saturation of the filter so isn't practical. \n", + "Rather, a **load factor**, $\\alpha$, threshold between zero and one is chosen and only this fraction of the slots are used which acts as a bound on the maximum number of items that can be inserted into the sketch so that $\\alpha \\le \\frac{n}{k}$.\n", + "\n", + "A false positive is obtained if an unseen element hashes to a slot that is canonical for a previously seen element and has a matching fingerprint in the correct location.\n", + "The probability of the former event, assuming that $n$ items have been inserted, is at most $\\alpha$ while the probability for the latter\n", + "The false positive rate using $r$ bit fingerprints is approximately:\n", + "\\begin{align*}\n", + "\\epsilon &\\approx \\alpha 2^{-r}.\n", + "\\end{align*}\n", + "\n", + "## 2. Results\n", + "\n", + "First we show the theoretical type of tradeoff that we hope to find in the characterisation profiles.\n", + "For both filters, we expect the false positive rate to exponentially decrease with the number of hash bits used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b9634ba5-37f9-47fd-ab24-eeb36e427cf3", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd \n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.ticker import AutoMinorLocator\n", + "from typing import Tuple\n", + "%matplotlib inline\n", + "plt.rcParams['font.size'] = 14" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5bc3ab73", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'False positive rate vs. Number of hash bits')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABAYAAALLCAYAAABq/ICJAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1QU1/v48fcsLL1YAFEQsKDSLAgYK5ZEjSZqbKCJRk3s6ZrmJ9ZoLFFjejOJJSqaWBJboolRY4oiShELiCKIHSkiUnd+f/hlfxLaroJYntc5nKMzd+48d/buwjx7515FVVUVIYQQQgghhBBCPJQ01R2AEEIIIYQQQgghqo8kBoQQQgghhBBCiIeYJAaEEEIIIYQQQoiHmCQGhBBCCCGEEEKIh5gkBoQQQgghhBBCiIeYJAaEEEIIIYQQQoiHmCQGhBBCCCGEEEKIh5gkBoQQQgghhBBCiIeYJAaE [...] 
+ "text/plain": [ + "<Figure size 1200x800 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "num_hash_bits = np.arange(1,33)\n", + "\n", + "\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "ax.plot(num_hash_bits, 2.**(-num_hash_bits), label=\"Optimal Bloom Filter\", color=\"blue\") \n", + "\n", + "colours = [\"silver\", \"grey\", \"black\"]\n", + "for i, alpha in enumerate([0.5, 0.8, 0.9]):\n", + " theory_str = \"$\\epsilon_{{}} = {} \\cdot 2^{{-f}}$\".format(alpha, alpha)\n", + " ax.plot(num_hash_bits, alpha*2.**(-num_hash_bits), label=theory_str, color=colours[i])\n", + "ax.grid()\n", + "ax.set_yscale(\"log\")\n", + "ax.legend()\n", + "ax.set_xlabel(r\"Number of hash bits\")\n", + "ax.set_ylabel(\"False Positive Rate\")\n", + "ax.set_title(\"False positive rate vs. Number of hash bits\")\n", + "#fig.savefig(\"false-positive-rates.pdf\")" + ] + }, + { + "cell_type": "markdown", + "id": "777bb727", + "metadata": {}, + "source": [ + "Note that for a _fixed_ bitmap length and input size, simply increasing the number of hash functions in the Bloom Filter does not improve performance.\n", + "When the number of hash functions increases, there is competing behaviour: on the one hand, more bits are set which can increase the number of false positives reported.\n", + "On the other hand, a false positive is triggered only when all of the bit positions are set, so using more hash functions means that the requirement is higher.\n", + "In contrast, because the false positive probability of a Quotient Filter is only related to the fingerprint length (rather than also the number of items inserted), increasing\n", + "the length of the fingerprint yields a monotonically decreasing false positive rate.\n", + "We can see this feature in the following plot." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "636b5abf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5e-06, 2.3405078559743058)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABAYAAAKxCAYAAADARa4uAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeXyM1/7A8c/s2XdbLBGRxE4V1aoQWkpbdLO1LrqpanXR9raKJGh7Xa5fW3F7Ve+tXbWl9F4UVVE7tVeQILFvScieWc/vj8mMjJlEVhHO22teyTzPmec5z3gy85zvc873KIQQAkmSJEmSJEmSJEmS7knK6q6AJEmSJEmSJEmSJEnVRwYGJEmSJEmSJEmSJOkeJgMDkiRJkiRJkiRJknQPk4EBSZIkSZIkSZIkSbqHycCAJEmSJEmSJEmSJN3DZGBAkiRJkiRJkiRJku5hMjAgSZIkSZIkSZIkSfcw [...] + "text/plain": [ + "<Figure size 1200x800 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ns = [1.5E8]\n", + "ms = [4, 8, 12, 16, 24]\n", + "ks = np.arange(1, 26)\n", + "\n", + "\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "\n", + "# bloom plots \n", + "bf_lines = []\n", + "for n in ns:\n", + " for r in ms:\n", + " m = r*n\n", + " bf_l, = ax.plot(ks, (1. 
- np.exp(-ks*n / m))**ks, label=f\"({n:.1e}, {m:.1e})\")\n", + " bf_lines.append(bf_l)\n", + "opt_bf_l, = ax.plot(ks, 2.**(-ks), label=r\"${fpr} = 2^{-f}$\", color=\"magenta\", linewidth=2)\n", + "bf_lines.append(opt_bf_l)\n", + "bf_legend = ax.legend(title=\"Bloom Filter: (n, m)\", handles=bf_lines, loc=\"upper left\")\n", + "ax.add_artist(bf_legend)\n", + "\n", + "# quotient plots\n", + "qf_lines = []\n", + "linestyles = [\":\", \"-.\", \"--\"]\n", + "for ii, alpha in enumerate([0.5, 0.75, 0.9]):\n", + " qf_l, = ax.plot(ks, alpha*2.**(-ks), label=f\"${alpha:.2f} \\cdot 2^{{-f}}$\", linestyle=linestyles[ii], color=\"black\")\n", + " qf_lines.append(qf_l)\n", + "\n", + "qf_legend = ax.legend(handles=qf_lines, title=\"Quotient filter FPR\", loc=\"upper right\")\n", + "\n", + "ax.set_xlabel(r\"Number of hash bits: $h$\")\n", + "ax.set_ylabel(\"False Positive Rate\")\n", + "ax.set_yscale(\"log\")\n", + "ax.grid()\n", + "\n", + "ax.set_ylim(bottom=5e-6)\n", + "#fig.savefig(\"image/fpr-vs-numh.pdf\")" + ] + }, + { + "cell_type": "markdown", + "id": "06245ebd", + "metadata": {}, + "source": [ + "Depending on the parameter setting, miscalibrating the Bloom filter parameters can lead to sharper changes in error performance." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "60b8bae1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.legend.Legend at 0x12f181410>" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABAMAAAKxCAYAAAAmbGVqAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAACtxElEQVR4nOzdeVxU9f7H8fcZdmRTwQ0VxL0yNQ3NNTOXzNJWzbJd27s3ra52K6tfZZa35bbd0m5qaWlZWrfMLffU3DVLxQUEXEEEZR2Y+f0BTBIuMAycGXg9Hw8fNeecOXyGL5Tznu/38zXsdrtdAAAAAACgxrCYXQAAAAAAAKhahAEAAAAAANQwhAEAAAAAANQwhAEAAAAAANQwhAEAAAAAANQwhAEAAAAAANQwhAEAAAAAANQwhAEAAAAAANQw3mYXUF3ZbDYdOnRIwcHBMgzD7HIAAAAAANWc3W7XqVOn1KhR [...] + "text/plain": [ + "<Figure size 1200x800 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "n = int(0.9*(1<<25)) # chosen for illustration\n", + "m = 1<<28\n", + "ks = np.arange(1, 17)\n", + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "ax.plot(ks, (1. - np.exp(-ks*n / m))**ks, label=\"Bloom Filter FPR\", marker=\"*\")\n", + "ax.set_xlabel(\"Number of Hash Functions\")\n", + "ax.set_ylabel(\"False Positive Rate\")\n", + "ax.grid()\n", + "ax.legend()" + ] + }, + { + "cell_type": "markdown", + "id": "ba3dd229", + "metadata": {}, + "source": [ + "### Experiment 1. 
Validating the theoretical performance.\n", + "\n", + "In this plot we demonstrate that the expected behaviour above is borne out in practice.\n", + "\n", + "The experimental configuration can be found at the bottom of the input result file and in this case is as follows.\n", + "Set the input cardinality to $n = 0.8 * 2^20$; vary the number of hash bits from a lower to an upper bound and use the number of hash bits to determine the number of trials at each hash bit value; \n", + "for the Bloom filter, use these parameters to determine an optimal filter and for the Quotient filter just use the number of hash bits as the fingerprint length." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2012f2a5", + "metadata": {}, + "outputs": [], + "source": [ + "def bloom_filter_fpr(n:int, k:int) -> float:\n", + " \"\"\"Returns the theoretical false positive rate of a Bloom filter with n elements, and k hash functions.\"\"\"\n", + " m = k * n\n", + " m /= np.log(2.)\n", + " m = int(m)\n", + " return (1 - (1 - 1/m)**(k*n))**k\n", + "\n", + "def quotient_filter_fpr(alpha: float, bits_per_entry: int) -> float:\n", + " \"\"\"\n", + " Returns the theoretical false positive rate of a Quotient filter with alpha load factor that \n", + " uses `bits_per_entry` many bits for each entry added to the sketch.\n", + " \"\"\"\n", + " return 2**(3 - alpha*bits_per_entry)\n", + "\n", + "from scipy.stats import binom as binom_dist\n", + "def probability_sum(num_slots:int, num_elements:int, fingerprint_length:int) -> float:\n", + " \"\"\"\n", + " The vectorized part of this code more efficiently evaluates \n", + " for k in range(m+1):\n", + " probs[k] = dist.pmf(k)\n", + " probs[k] *= (1. - (1. 
- 2.**(-f))**k)\n", + "\n", + " This sum is well approximated by alpha*2^(-f) for alpha = n / m\n", + " \"\"\"\n", + " m = num_slots\n", + " n = num_elements\n", + " f = fingerprint_length\n", + " assert isinstance(m, int), \"num_slots must be an integer\"\n", + " assert isinstance(n, int), \"num_elements must be an integer\"\n", + " assert isinstance(f, int), \"fingerprint_length must be an integer\"\n", + " dist = binom_dist(n, 1./m)\n", + " probs = dist.pmf(np.arange(m+1))\n", + " probs *= (1. - (1. - 2.**(-f))**np.arange(m+1))\n", + " return np.sum(probs)\n", + "\n", + "n = int(0.9 * 2**20)\n", + "theoretical_bloom_results = [] \n", + "theoretical_quotient_results = []\n", + "num_test_bits = np.arange(4,25)\n", + "\n", + "for k in num_test_bits:\n", + " theoretical_bloom_results.append([k, k/np.log(2), bloom_filter_fpr(n, k)])\n", + " theoretical_quotient_results.append([k, k, quotient_filter_fpr(0.9, k)])\n", + "\n", + "\n", + "bloom_theory_accuracy = pd.DataFrame(theoretical_bloom_results, columns=['k', 'bitsPerEntry', 'FPR'])\n", + "quotient_theory_accuracy = pd.DataFrame(theoretical_quotient_results, columns=['k', 'bitsPerEntry', 'FPR'])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "71f122df", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_accuracy_results(filename:str) -> Tuple[pd.DataFrame, float, int]:\n", + " \"\"\"\n", + " Takes an input file name in .txt format, strips out the extra strings and returns \n", + " the main results in a dataframe.\n", + "\n", + " The universe size is the largest power of 2 that is used to insert into the sketch and \n", + " the capacity is how far along the power of two interval used for maximum cardinality.\n", + "\n", + " Eg. 
capacity = 0.9 and lgU = 20 means that we insert 0.9 * 2^20 elements into the sketch.\n", + " \"\"\"\n", + " with open(filename) as f:\n", + " lines = f.readlines()[2:]\n", + " ignore_from_idx = lines.index(\"PROPERTIES:\\n\")\n", + " results = lines[:ignore_from_idx-1] # there is an empty line preceding the ignore_from_idx\n", + " results_clean = [r.strip(\"\\n\") for r in results]\n", + " results_df = pd.DataFrame(sub.split(\"\\t\") for sub in results_clean[1:])\n", + " results_df.columns = results_clean[0].split(\"\\t\")\n", + " for col in results_df.columns:\n", + " results_df[col] = pd.to_numeric(results_df[col])\n", + "\n", + " # get the universe size and capacity \n", + " properties = lines[ignore_from_idx:]\n", + " properties_clean = [p.strip(\"\\n\") for p in properties]\n", + " strings_to_find = ['Universe_capacity', 'Universe_lgU']\n", + " found_strings = [s for s in properties_clean if any(substring in s for substring in strings_to_find)]\n", + " values = [float(s.split('=')[1]) for s in found_strings]\n", + " capacity = values[0]\n", + " lgU = int(values[1])\n", + " return results_df, capacity, lgU" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "dfc34112", + "metadata": {}, + "outputs": [], + "source": [ + "bloom_results_df, bloom_thld, bloom_lgU = parse_accuracy_results(\"BloomFilterAccuracyProfile20240612_050335PST.txt\")\n", + "quotient_results_df, q90, _ = parse_accuracy_results(\"QuotientFilterAccuracyProfile20240617_065441PST.txt\")\n", + "quotient_results_df_75, q75, _ = parse_accuracy_results(\"QuotientFilterAccuracyProfile20240612_062110PST.txt\")\n", + "m = 2**20\n", + "q_alpha = 0.9\n", + "q_n = int(q_alpha*m)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3c28b6f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bloom_lgU" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 20, + "id": "080f7541", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>numHashes</th>\n", + " <th>bitsPerEntry_bloom</th>\n", + " <th>FPR_bloom</th>\n", + " <th>filterSizeBits_bloom</th>\n", + " <th>bitsPerEntry_quotient</th>\n", + " <th>FPR_quotient</th>\n", + " <th>filterSizeBits_quotient</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>10</td>\n", + " <td>14</td>\n", + " <td>1.016590e-03</td>\n", + " <td>13614976</td>\n", + " <td>10</td>\n", + " <td>7.020040e-03</td>\n", + " <td>10485760</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>11</td>\n", + " <td>15</td>\n", + " <td>4.788000e-04</td>\n", + " <td>14976512</td>\n", + " <td>11</td>\n", + " <td>3.261530e-03</td>\n", + " <td>11534336</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>12</td>\n", + " <td>17</td>\n", + " <td>2.345400e-04</td>\n", + " <td>16337984</td>\n", + " <td>12</td>\n", + " <td>1.752870e-03</td>\n", + " <td>12582912</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>13</td>\n", + " <td>18</td>\n", + " <td>1.299970e-04</td>\n", + " <td>17699520</td>\n", + " <td>13</td>\n", + " <td>8.584560e-04</td>\n", + " <td>13631488</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>14</td>\n", + " <td>20</td>\n", + " <td>5.921320e-05</td>\n", + " <td>19060992</td>\n", + " <td>14</td>\n", + " <td>4.600410e-04</td>\n", + " <td>14680064</td>\n", + " </tr>\n", + " <tr>\n", + " <th>11</th>\n", + " <td>15</td>\n", + " <td>21</td>\n", + " 
<td>2.975460e-05</td>\n", + " <td>20422464</td>\n", + " <td>15</td>\n", + " <td>2.182010e-04</td>\n", + " <td>15728640</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>16</td>\n", + " <td>23</td>\n", + " <td>1.885760e-05</td>\n", + " <td>21784000</td>\n", + " <td>16</td>\n", + " <td>1.101220e-04</td>\n", + " <td>16777216</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>17</td>\n", + " <td>24</td>\n", + " <td>7.708870e-06</td>\n", + " <td>23145472</td>\n", + " <td>17</td>\n", + " <td>5.435940e-05</td>\n", + " <td>17825792</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>18</td>\n", + " <td>25</td>\n", + " <td>3.326770e-06</td>\n", + " <td>24507008</td>\n", + " <td>18</td>\n", + " <td>2.869890e-05</td>\n", + " <td>18874368</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>19</td>\n", + " <td>27</td>\n", + " <td>2.005160e-06</td>\n", + " <td>25868480</td>\n", + " <td>19</td>\n", + " <td>1.393830e-05</td>\n", + " <td>19922944</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>20</td>\n", + " <td>28</td>\n", + " <td>8.742010e-07</td>\n", + " <td>27229952</td>\n", + " <td>20</td>\n", + " <td>6.834670e-06</td>\n", + " <td>20971520</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>21</td>\n", + " <td>30</td>\n", + " <td>5.129610e-07</td>\n", + " <td>28591488</td>\n", + " <td>21</td>\n", + " <td>3.294510e-06</td>\n", + " <td>22020096</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>22</td>\n", + " <td>31</td>\n", + " <td>2.861020e-07</td>\n", + " <td>29952960</td>\n", + " <td>22</td>\n", + " <td>1.688800e-06</td>\n", + " <td>23068672</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>23</td>\n", + " <td>33</td>\n", + " <td>1.490120e-07</td>\n", + " <td>31314496</td>\n", + " <td>23</td>\n", + " <td>8.621390e-07</td>\n", + " <td>24117248</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>24</td>\n", + " <td>34</td>\n", + " <td>5.501970e-08</td>\n", + 
" <td>32675968</td>\n", + " <td>24</td>\n", + " <td>4.779830e-07</td>\n", + " <td>25165824</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " numHashes bitsPerEntry_bloom FPR_bloom filterSizeBits_bloom \\\n", + "6 10 14 1.016590e-03 13614976 \n", + "7 11 15 4.788000e-04 14976512 \n", + "8 12 17 2.345400e-04 16337984 \n", + "9 13 18 1.299970e-04 17699520 \n", + "10 14 20 5.921320e-05 19060992 \n", + "11 15 21 2.975460e-05 20422464 \n", + "12 16 23 1.885760e-05 21784000 \n", + "13 17 24 7.708870e-06 23145472 \n", + "14 18 25 3.326770e-06 24507008 \n", + "15 19 27 2.005160e-06 25868480 \n", + "16 20 28 8.742010e-07 27229952 \n", + "17 21 30 5.129610e-07 28591488 \n", + "18 22 31 2.861020e-07 29952960 \n", + "19 23 33 1.490120e-07 31314496 \n", + "20 24 34 5.501970e-08 32675968 \n", + "\n", + " bitsPerEntry_quotient FPR_quotient filterSizeBits_quotient \n", + "6 10 7.020040e-03 10485760 \n", + "7 11 3.261530e-03 11534336 \n", + "8 12 1.752870e-03 12582912 \n", + "9 13 8.584560e-04 13631488 \n", + "10 14 4.600410e-04 14680064 \n", + "11 15 2.182010e-04 15728640 \n", + "12 16 1.101220e-04 16777216 \n", + "13 17 5.435940e-05 17825792 \n", + "14 18 2.869890e-05 18874368 \n", + "15 19 1.393830e-05 19922944 \n", + "16 20 6.834670e-06 20971520 \n", + "17 21 3.294510e-06 22020096 \n", + "18 22 1.688800e-06 23068672 \n", + "19 23 8.621390e-07 24117248 \n", + "20 24 4.779830e-07 25165824 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df = pd.merge(bloom_results_df, quotient_results_df, on=\"numHashes\", suffixes=(\"_bloom\", \"_quotient\"))\n", + "combined_df.drop([\"numQueryPoints_bloom\", \"numQueryPoints_quotient\", \"numTrials_bloom\", \"numTrials_quotient\"], axis=1, inplace=True)\n", + "combined_df.tail(15)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "3fc0d0fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + 
"<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>numHashes</th>\n", + " <th>bitsPerEntry</th>\n", + " <th>FPR</th>\n", + " <th>filterSizeBits</th>\n", + " <th>numQueryPoints</th>\n", + " <th>numTrials</th>\n", + " <th>fingerprintLength</th>\n", + " <th>theoreticalFPR</th>\n", + " <th>scaledFPR</th>\n", + " <th>exactCalcFPR</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>3.590670e-01</td>\n", + " <td>4194304</td>\n", + " <td>32</td>\n", + " <td>608</td>\n", + " <td>1</td>\n", + " <td>5.000000e-01</td>\n", + " <td>4.500000e-01</td>\n", + " <td>3.623718e-01</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>2.019490e-01</td>\n", + " <td>5242880</td>\n", + " <td>64</td>\n", + " <td>412</td>\n", + " <td>2</td>\n", + " <td>2.500000e-01</td>\n", + " <td>2.250000e-01</td>\n", + " <td>2.014837e-01</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>6</td>\n", + " <td>6</td>\n", + " <td>1.035220e-01</td>\n", + " <td>6291456</td>\n", + " <td>128</td>\n", + " <td>299</td>\n", + " <td>3</td>\n", + " <td>1.250000e-01</td>\n", + " <td>1.125000e-01</td>\n", + " <td>1.064026e-01</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>7</td>\n", + " <td>7</td>\n", + " <td>5.340250e-02</td>\n", + " <td>7340032</td>\n", + " <td>256</td>\n", + " <td>228</td>\n", + " <td>4</td>\n", + " <td>6.250000e-02</td>\n", + " <td>5.625000e-02</td>\n", + " <td>5.469720e-02</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>8</td>\n", + " <td>8</td>\n", + " 
<td>2.748400e-02</td>\n", + " <td>8388608</td>\n", + " <td>512</td>\n", + " <td>181</td>\n", + " <td>5</td>\n", + " <td>3.125000e-02</td>\n", + " <td>2.812500e-02</td>\n", + " <td>2.773316e-02</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>9</td>\n", + " <td>9</td>\n", + " <td>1.373830e-02</td>\n", + " <td>9437184</td>\n", + " <td>1024</td>\n", + " <td>147</td>\n", + " <td>6</td>\n", + " <td>1.562500e-02</td>\n", + " <td>1.406250e-02</td>\n", + " <td>1.396408e-02</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>7.020040e-03</td>\n", + " <td>10485760</td>\n", + " <td>2048</td>\n", + " <td>122</td>\n", + " <td>7</td>\n", + " <td>7.812500e-03</td>\n", + " <td>7.031250e-03</td>\n", + " <td>7.006586e-03</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>11</td>\n", + " <td>11</td>\n", + " <td>3.261530e-03</td>\n", + " <td>11534336</td>\n", + " <td>4096</td>\n", + " <td>103</td>\n", + " <td>8</td>\n", + " <td>3.906250e-03</td>\n", + " <td>3.515625e-03</td>\n", + " <td>3.509451e-03</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>12</td>\n", + " <td>12</td>\n", + " <td>1.752870e-03</td>\n", + " <td>12582912</td>\n", + " <td>8192</td>\n", + " <td>89</td>\n", + " <td>9</td>\n", + " <td>1.953125e-03</td>\n", + " <td>1.757813e-03</td>\n", + " <td>1.756268e-03</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>13</td>\n", + " <td>13</td>\n", + " <td>8.584560e-04</td>\n", + " <td>13631488</td>\n", + " <td>16384</td>\n", + " <td>77</td>\n", + " <td>10</td>\n", + " <td>9.765625e-04</td>\n", + " <td>8.789063e-04</td>\n", + " <td>8.785198e-04</td>\n", + " </tr>\n", + " <tr>\n", + " <th>10</th>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>4.600410e-04</td>\n", + " <td>14680064</td>\n", + " <td>32768</td>\n", + " <td>67</td>\n", + " <td>11</td>\n", + " <td>4.882812e-04</td>\n", + " <td>4.394531e-04</td>\n", + " <td>4.393564e-04</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>11</th>\n", + " <td>15</td>\n", + " <td>15</td>\n", + " <td>2.182010e-04</td>\n", + " <td>15728640</td>\n", + " <td>65536</td>\n", + " <td>60</td>\n", + " <td>12</td>\n", + " <td>2.441406e-04</td>\n", + " <td>2.197266e-04</td>\n", + " <td>2.197023e-04</td>\n", + " </tr>\n", + " <tr>\n", + " <th>12</th>\n", + " <td>16</td>\n", + " <td>16</td>\n", + " <td>1.101220e-04</td>\n", + " <td>16777216</td>\n", + " <td>131072</td>\n", + " <td>53</td>\n", + " <td>13</td>\n", + " <td>1.220703e-04</td>\n", + " <td>1.098633e-04</td>\n", + " <td>1.098572e-04</td>\n", + " </tr>\n", + " <tr>\n", + " <th>13</th>\n", + " <td>17</td>\n", + " <td>17</td>\n", + " <td>5.435940e-05</td>\n", + " <td>17825792</td>\n", + " <td>262144</td>\n", + " <td>48</td>\n", + " <td>14</td>\n", + " <td>6.103516e-05</td>\n", + " <td>5.493164e-05</td>\n", + " <td>5.493011e-05</td>\n", + " </tr>\n", + " <tr>\n", + " <th>14</th>\n", + " <td>18</td>\n", + " <td>18</td>\n", + " <td>2.869890e-05</td>\n", + " <td>18874368</td>\n", + " <td>524288</td>\n", + " <td>43</td>\n", + " <td>15</td>\n", + " <td>3.051758e-05</td>\n", + " <td>2.746582e-05</td>\n", + " <td>2.746543e-05</td>\n", + " </tr>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>19</td>\n", + " <td>19</td>\n", + " <td>1.393830e-05</td>\n", + " <td>19922944</td>\n", + " <td>1048576</td>\n", + " <td>39</td>\n", + " <td>16</td>\n", + " <td>1.525879e-05</td>\n", + " <td>1.373291e-05</td>\n", + " <td>1.373281e-05</td>\n", + " </tr>\n", + " <tr>\n", + " <th>16</th>\n", + " <td>20</td>\n", + " <td>20</td>\n", + " <td>6.834670e-06</td>\n", + " <td>20971520</td>\n", + " <td>2097152</td>\n", + " <td>36</td>\n", + " <td>17</td>\n", + " <td>7.629395e-06</td>\n", + " <td>6.866455e-06</td>\n", + " <td>6.866429e-06</td>\n", + " </tr>\n", + " <tr>\n", + " <th>17</th>\n", + " <td>21</td>\n", + " <td>21</td>\n", + " <td>3.294510e-06</td>\n", + " <td>22020096</td>\n", + " <td>4194304</td>\n", + " <td>33</td>\n", + " <td>18</td>\n", + " <td>3.814697e-06</td>\n", + " 
<td>3.433228e-06</td>\n", + " <td>3.433220e-06</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18</th>\n", + " <td>22</td>\n", + " <td>22</td>\n", + " <td>1.688800e-06</td>\n", + " <td>23068672</td>\n", + " <td>8388608</td>\n", + " <td>30</td>\n", + " <td>19</td>\n", + " <td>1.907349e-06</td>\n", + " <td>1.716614e-06</td>\n", + " <td>1.716612e-06</td>\n", + " </tr>\n", + " <tr>\n", + " <th>19</th>\n", + " <td>23</td>\n", + " <td>23</td>\n", + " <td>8.621390e-07</td>\n", + " <td>24117248</td>\n", + " <td>16777216</td>\n", + " <td>28</td>\n", + " <td>20</td>\n", + " <td>9.536743e-07</td>\n", + " <td>8.583069e-07</td>\n", + " <td>8.583062e-07</td>\n", + " </tr>\n", + " <tr>\n", + " <th>20</th>\n", + " <td>24</td>\n", + " <td>24</td>\n", + " <td>4.779830e-07</td>\n", + " <td>25165824</td>\n", + " <td>33554432</td>\n", + " <td>26</td>\n", + " <td>21</td>\n", + " <td>4.768372e-07</td>\n", + " <td>4.291534e-07</td>\n", + " <td>4.291532e-07</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " numHashes bitsPerEntry FPR filterSizeBits numQueryPoints \\\n", + "0 4 4 3.590670e-01 4194304 32 \n", + "1 5 5 2.019490e-01 5242880 64 \n", + "2 6 6 1.035220e-01 6291456 128 \n", + "3 7 7 5.340250e-02 7340032 256 \n", + "4 8 8 2.748400e-02 8388608 512 \n", + "5 9 9 1.373830e-02 9437184 1024 \n", + "6 10 10 7.020040e-03 10485760 2048 \n", + "7 11 11 3.261530e-03 11534336 4096 \n", + "8 12 12 1.752870e-03 12582912 8192 \n", + "9 13 13 8.584560e-04 13631488 16384 \n", + "10 14 14 4.600410e-04 14680064 32768 \n", + "11 15 15 2.182010e-04 15728640 65536 \n", + "12 16 16 1.101220e-04 16777216 131072 \n", + "13 17 17 5.435940e-05 17825792 262144 \n", + "14 18 18 2.869890e-05 18874368 524288 \n", + "15 19 19 1.393830e-05 19922944 1048576 \n", + "16 20 20 6.834670e-06 20971520 2097152 \n", + "17 21 21 3.294510e-06 22020096 4194304 \n", + "18 22 22 1.688800e-06 23068672 8388608 \n", + "19 23 23 8.621390e-07 24117248 16777216 \n", + "20 24 24 4.779830e-07 
25165824 33554432 \n", + "\n", + " numTrials fingerprintLength theoreticalFPR scaledFPR exactCalcFPR \n", + "0 608 1 5.000000e-01 4.500000e-01 3.623718e-01 \n", + "1 412 2 2.500000e-01 2.250000e-01 2.014837e-01 \n", + "2 299 3 1.250000e-01 1.125000e-01 1.064026e-01 \n", + "3 228 4 6.250000e-02 5.625000e-02 5.469720e-02 \n", + "4 181 5 3.125000e-02 2.812500e-02 2.773316e-02 \n", + "5 147 6 1.562500e-02 1.406250e-02 1.396408e-02 \n", + "6 122 7 7.812500e-03 7.031250e-03 7.006586e-03 \n", + "7 103 8 3.906250e-03 3.515625e-03 3.509451e-03 \n", + "8 89 9 1.953125e-03 1.757813e-03 1.756268e-03 \n", + "9 77 10 9.765625e-04 8.789063e-04 8.785198e-04 \n", + "10 67 11 4.882812e-04 4.394531e-04 4.393564e-04 \n", + "11 60 12 2.441406e-04 2.197266e-04 2.197023e-04 \n", + "12 53 13 1.220703e-04 1.098633e-04 1.098572e-04 \n", + "13 48 14 6.103516e-05 5.493164e-05 5.493011e-05 \n", + "14 43 15 3.051758e-05 2.746582e-05 2.746543e-05 \n", + "15 39 16 1.525879e-05 1.373291e-05 1.373281e-05 \n", + "16 36 17 7.629395e-06 6.866455e-06 6.866429e-06 \n", + "17 33 18 3.814697e-06 3.433228e-06 3.433220e-06 \n", + "18 30 19 1.907349e-06 1.716614e-06 1.716612e-06 \n", + "19 28 20 9.536743e-07 8.583069e-07 8.583062e-07 \n", + "20 26 21 4.768372e-07 4.291534e-07 4.291532e-07 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "quotient_results_df[\"fingerprintLength\"] = quotient_results_df[\"numHashes\"] - 3\n", + "quotient_results_df[\"theoreticalFPR\"] = 2.**(-quotient_results_df[\"fingerprintLength\"])\n", + "quotient_results_df[\"scaledFPR\"] = 0.9*2.**(-quotient_results_df[\"fingerprintLength\"])\n", + "quotient_results_df[\"exactCalcFPR\"] = pd.Series([probability_sum(m, q_n, f) for f in quotient_results_df[\"fingerprintLength\"]])\n", + "\n", + "quotient_results_df" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8d180df5", + "metadata": {}, + "outputs": [], + "source": [ + "for q in [q90, q75]:\n", + " 
q_n = int(q*m)\n", + " quotient_results_df_75[\"fingerprintLength\"] = quotient_results_df_75[\"numHashes\"] - 3\n", + " quotient_results_df_75[\"theoreticalFPR\"] = 2.**(-quotient_results_df_75[\"fingerprintLength\"])\n", + " quotient_results_df_75[\"exactCalcFPR\"] = pd.Series([probability_sum(m, q_n, f) for f in quotient_results_df_75[\"fingerprintLength\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d71441f", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABAIAAAK1CAYAAABSP0xCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd1hT5/vH8XfC3qigOEBURNx7I4IDq7ZqndU6wNaFW9qvo3VWax217l1lWBx11q2oiKPi3lgngqhVHCAiEEh+f/AjSkFFRQl6v64rV+s5T875nAQj584zFBqNRoMQQgghhBBCCCE+CcrcDiCEEEIIIYQQQogPRwoBQgghhBBCCCHEJ0QKAUIIIYQQQgghxCdECgFCCCGEEEIIIcQnRAoBQgghhBBCCCHEJ0QKAUIIIYQQQgghxCdECgFCCCGEEEIIIcQnRAoBQgghhBBCCCHEJ0Q/twN8rNRqNbdv [...] + "text/plain": [ + "<Figure size 1200x800 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "ax.plot(bloom_results_df[\"numHashes\"], bloom_results_df[\"FPR\"], label=\"Bloom\", color=\"royalblue\")\n", + "\n", + "ax.plot(quotient_results_df[\"fingerprintLength\"], quotient_results_df[\"FPR\"], label=\"quotient:0.9\", color=\"black\")\n", + "ax.plot(quotient_results_df[\"fingerprintLength\"], 2.**(-quotient_results_df[\"fingerprintLength\"]), label=\"General bound: $2^{-f}$\", color=\"deeppink\", linestyle=\":\")\n", + "\n", + "\n", + "ax.plot(quotient_results_df_75[\"fingerprintLength\"], quotient_results_df_75[\"FPR\"], label=\"quotient:0.75\", color=\"grey\")\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=2)\n", + "ax.legend()\n", + "ax.set_xticks(quotient_results_df[\"fingerprintLength\"])\n", + "ax.set_yticks([2.**(-j) for j in range(1,25)])\n", + "ax.set_xlabel(\"Number of hash 
space bits $f$\")\n", + "ax.set_ylabel(\"False Positive Rate\")\n", + "fig.savefig(\"image/accuracy-vs-num-hashes.pdf\")" + ] + }, + { + "cell_type": "markdown", + "id": "f3a14a50", + "metadata": {}, + "source": [ + "Both error rates roughly follow the curve $2^{-f}$. This means that increasing the number of hash bits space by one reduces the false positive rate by a factor of two. The false positive rate of the Quotient filter is slightly smaller when it is less-densely filled. \n", + "\n", + "We can also explore the space efficiency: namely the number of bits used per element added to the filter. We see that for lower accuracy (high fpr) regimes, the Bloom filter is more efficient, \n", + "while the converse is true for higher accuracy (low fpr) regimes." + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "62b845e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, 'Number of hash space bits $f$')" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA/EAAAK1CAYAAACAf25bAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAADFXklEQVR4nOzdd3hU1cLF4d+kkgQSepOOcEGKfCIg0ktClaKIFKkKgiDSe5JJ6F2aoiBVERAUAeldQOGigCCCdJBeE5JASDLz/XEkV6QYhiRnkqz3eXyc2XNmZmUMkpWzz94Wu91uR0REREREREScnovZAUREREREREQkYVTiRURERERERFIIlXgRERERERGRFEIlXkRERERERCSFUIkXERERERERSSFU4kVERERERERSCJV4ERERERERkRRCJV5EREREREQkhXAzO4CzsdlsXLhwgQwZMmCxWMyOIyIiIiIiIqmc [...] 
+ "text/plain": [ + "<Figure size 1200x800 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "ax.plot(bloom_results_df[\"numHashes\"], bloom_results_df[\"bitsPerEntry\"], label=\"Bloom\", color=\"blue\")\n", + "ax.plot(quotient_results_df[\"fingerprintLength\"], quotient_results_df[\"bitsPerEntry\"], label=\"quotient\", color=\"black\")\n", + "ax.grid()\n", + "ax.legend()\n", + "ax.set_xticks(quotient_results_df[\"fingerprintLength\"])\n", + "ax.set_ylabel(\"Bits per entry\")\n", + "ax.set_xlabel(\"Number of hash space bits $f$\")" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "91d13517", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'False Positive Rate')" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABAIAAAKxCAYAAADJrg5UAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAD8b0lEQVR4nOzdd1gU19vG8e/Si4I1liiCEjXF3rvYe+8VRVNMYo0xJjFqEmM0xhRNU0EQBXvv2Du2aEzsBTsqFkBF6r5/7E/eEBsgsKD357r2isycmXnOetg4z855jsFoNBoRERERERERkZeChbkDEBEREREREZGMo0SAiIiIiIiIyEtEiQARERERERGRl4gSASIiIiIiIiIvESUCRERERERERF4iSgSIiIiIiIiIvESUCBARERERERF5iSgRICIiIiIiIvISsTJ3AC+qhIQErly5Qvbs2TEYDOYOR0RERERERF5w [...] 
+ "text/plain": [ + "<Figure size 1200x800 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 8))\n", + "\n", + "ax.plot(bloom_results_df[\"bitsPerEntry\"], bloom_results_df[\"FPR\"], label=\"Bloom\", color=\"blue\")\n", + "\n", + "ax.plot(quotient_results_df[\"bitsPerEntry\"], quotient_results_df[\"FPR\"], label=\"quotient:0.9\", color=\"black\")\n", + "ax.plot(quotient_results_df_75[\"bitsPerEntry\"], quotient_results_df_75[\"FPR\"], label=\"quotient:0.75\", color=\"grey\")\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=2)\n", + "ax.legend()\n", + "ax.set_xlabel(\"bits per entry\")\n", + "ax.set_ylabel(\"False Positive Rate\")" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "a0342a9b", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA/EAAAI0CAYAAABVk6kmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAD4wklEQVR4nOzdd3gU1dfA8e/spickIfQWSkKVphB674hSBARUuogK0qtSQhcIRREVBOlNsYBKkSJFlKLSRCmhCwJCICFAgOze94/9ZV9CypbsshtyPs+TR5g5986ZOTuYuzNzR1NKKYQQQgghhBBCCOH2dK5OQAghhBBCCCGEENaRQbwQQgghhBBCCJFJyCBeCCGEEEIIIYTIJGQQL4QQQgghhBBCZBIyiBdCCCGEEEIIITIJGcQLIYQQQgghhBCZhAzihRBCCCGEEEKITEIG8UIIIYQQQgghRCYhg3ghhBBCCCGE [...] 
+ "text/plain": [ + "<Figure size 1200x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12,6))\n", + "\n", + "\n", + "NRML_CNST = 1E6 \n", + "ax.plot(bloom_results_df[\"FPR\"], bloom_results_df[\"filterSizeBits\"]/NRML_CNST,\n", + " label=\"Bloom Filter\", color=\"red\", linestyle=\"-\")\n", + "ax.plot(quotient_results_df[\"FPR\"], quotient_results_df[\"filterSizeBits\"]/NRML_CNST,\n", + " label=\"Quotient Filter\", color=\"blue\", linestyle=\"-\")\n", + "\n", + "\n", + "ax.legend()\n", + "#ax.set_yscale(\"log\", base=2)\n", + "ax.set_xscale(\"log\", base=10)\n", + "ax.set_ylabel(\"Sketch size (Mb)\")\n", + "ax.set_xlabel(\"False Positive Rate\")\n", + "\n", + "ax.set_ylim(3,30)\n", + "ax.set_title(\"Filter size (bits) vs False Positive Rate\")\n", + "ax.yaxis.set_minor_locator(AutoMinorLocator())\n", + "ax.grid(which=\"both\", alpha=0.5)" + ] + }, + { + "cell_type": "markdown", + "id": "4536560c", + "metadata": {}, + "source": [ + "Consistent with the prior plots, we see that the there are two regions of behaviour. \n", + "We control the false positive rate for both filters, using an optimally-configured Bloom filter and a quotient filter with the number of fingerprints that achieves the fixed FPR. For large false positive rates, a Bloom filter is smaller than Quotient filter. However, if a smaller false positive rate is desired, then a Quotient filter has a lower space footprint." + ] + }, + { + "cell_type": "markdown", + "id": "d418104f", + "metadata": {}, + "source": [ + "## References\n", + "<a id=\"1\">[1]</a> \n", + "Bender, Michael A., et al. \"Don't thrash: How to cache your hash on flash.\" 3rd Workshop on Hot Topics in Storage and File Systems (HotStorage 11). 2011." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "density-venv", + "language": "python", + "name": "density_venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/notebooks/README.md b/src/notebooks/README.md new file mode 100644 index 0000000..38a4b0c --- /dev/null +++ b/src/notebooks/README.md @@ -0,0 +1,29 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# Quotient Filter Profiles Plotting + +This folder contains code for plotting quotient filter profiles. +The notebooks are used to generate visualizations and analyze the performance of quotient filters. + +## Usage + +To use the plotting code you may need to install the packages in +the `requirements.txt` file. 
+ diff --git a/src/notebooks/filterSizeProfile.ipynb b/src/notebooks/filterSizeProfile.ipynb new file mode 100644 index 0000000..1b45b98 --- /dev/null +++ b/src/notebooks/filterSizeProfile.ipynb @@ -0,0 +1,1119 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Size vs Cardinality for Bloom and Quotient Filters\n", + "\n", + "We investigate the size of the filters as the cardinality of the input set increases. \n", + "Linear space is required to solve the approximate set membership problem, so we expect the size of the filters to *increase* as the input cardinality increases for a fixed \n", + "false positive rate (fpr). However, we are interested in the relative performance of the Bloom and Quotient filters at a fixed false positive rate. This enables us to contrast the performance of each filter.\n", + "\n", + "## Experiment 1. False Positive Rate $10^{-6}$\n", + "\n", + "Fix a target false positive rate and then configure the filters appropriately for increasing input size $n$.\n", + "The Quotient filter has its number of slots rounded to the next power of two larger than the current $n$ and the Bloom filter is set optimally for any given $n$.\n", + "As a result, the Bloom filter results are as optimistic as can be." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.ticker import AutoMinorLocator\n", + "from typing import Tuple\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def parse_size_results(filename:str) -> pd.DataFrame:\n", + " with open(filename) as f:\n", + " lines = f.readlines()[2:] # first two lines not needed\n", + " ignore_from_idx = lines.index(\"PROPERTIES:\\n\")\n", + " results = lines[:ignore_from_idx-1] # there is an empty line preceding the ignore_from_idx\n", + " results_clean = [r.strip(\"\\n\") for r in results]\n", + " results_df = pd.DataFrame(sub.split(\"\\t\") for sub in results_clean[1:])\n", + " results_df.columns = results_clean[0].split(\"\\t\")\n", + " results_df.rename(columns={\"TrueU\" : \"inputCardinality\"}, inplace=True)\n", + " for col in results_df.columns:\n", + " results_df[col] = pd.to_numeric(results_df[col])\n", + " return results_df" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "bf_sizes = parse_size_results(\"1e-6/BloomFilterSizeProfile20240703_014140PST.txt\")\n", + "qf_sizes = parse_size_results(\"1e-6/QuotientFilterSizeProfile20240703_014255PST_1E-6.txt\") # 1e-6\n", + "\n", + "for df in [bf_sizes, qf_sizes]:\n", + " df[\"bitsPerElement\"] = df[\"Size\"] / df[\"inputCardinality\"]\n", + "\n", + "for qf in [qf_sizes]:\n", + " qf[\"numSlots\"] = np.ceil(np.log2(qf[\"inputCardinality\"] / 0.9 )).astype(int) # nb this is because the suggestion function always uses 0.9 and may change in the future.\n", + " qf[\"loadFactor\"] = qf[\"inputCardinality\"] / (2**qf[\"numSlots\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + 
"<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>inputCardinality</th>\n", + " <th>Size</th>\n", + " <th>NumTrials</th>\n", + " <th>FalsePositiveRate</th>\n", + " <th>NumHashBits</th>\n", + " <th>bitsPerElement</th>\n", + " <th>numSlots</th>\n", + " <th>loadFactor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>3</td>\n", + " <td>176</td>\n", + " <td>16384</td>\n", + " <td>9.536743e-07</td>\n", + " <td>20</td>\n", + " <td>58.666667</td>\n", + " <td>2</td>\n", + " <td>0.750000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>184</td>\n", + " <td>10922</td>\n", + " <td>2.384186e-07</td>\n", + " <td>20</td>\n", + " <td>46.000000</td>\n", + " <td>3</td>\n", + " <td>0.500000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>5</td>\n", + " <td>184</td>\n", + " <td>8192</td>\n", + " <td>4.768372e-07</td>\n", + " <td>20</td>\n", + " <td>36.800000</td>\n", + " <td>3</td>\n", + " <td>0.625000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>6</td>\n", + " <td>184</td>\n", + " <td>6553</td>\n", + " <td>4.768372e-07</td>\n", + " <td>20</td>\n", + " <td>30.666667</td>\n", + " <td>3</td>\n", + " <td>0.750000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>7</td>\n", + " <td>352</td>\n", + " <td>5461</td>\n", + " <td>2.384186e-07</td>\n", + " <td>20</td>\n", + " <td>50.285714</td>\n", + " <td>3</td>\n", + " <td>0.875000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " 
<td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>169</th>\n", + " <td>794672</td>\n", + " <td>24117248</td>\n", + " <td>1024</td>\n", + " <td>1.192093e-06</td>\n", + " <td>20</td>\n", + " <td>30.348682</td>\n", + " <td>20</td>\n", + " <td>0.757858</td>\n", + " </tr>\n", + " <tr>\n", + " <th>170</th>\n", + " <td>851708</td>\n", + " <td>24117248</td>\n", + " <td>1024</td>\n", + " <td>1.192093e-06</td>\n", + " <td>20</td>\n", + " <td>28.316334</td>\n", + " <td>20</td>\n", + " <td>0.812252</td>\n", + " </tr>\n", + " <tr>\n", + " <th>171</th>\n", + " <td>912838</td>\n", + " <td>24117248</td>\n", + " <td>1024</td>\n", + " <td>2.384186e-07</td>\n", + " <td>20</td>\n", + " <td>26.420075</td>\n", + " <td>20</td>\n", + " <td>0.870550</td>\n", + " </tr>\n", + " <tr>\n", + " <th>172</th>\n", + " <td>978356</td>\n", + " <td>48234496</td>\n", + " <td>1024</td>\n", + " <td>7.152557e-07</td>\n", + " <td>20</td>\n", + " <td>49.301579</td>\n", + " <td>21</td>\n", + " <td>0.466516</td>\n", + " </tr>\n", + " <tr>\n", + " <th>173</th>\n", + " <td>1048576</td>\n", + " <td>48234496</td>\n", + " <td>1024</td>\n", + " <td>2.384186e-07</td>\n", + " <td>20</td>\n", + " <td>46.000000</td>\n", + " <td>21</td>\n", + " <td>0.500000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>174 rows × 8 columns</p>\n", + "</div>" + ], + "text/plain": [ + " inputCardinality Size NumTrials FalsePositiveRate NumHashBits \\\n", + "0 3 176 16384 9.536743e-07 20 \n", + "1 4 184 10922 2.384186e-07 20 \n", + "2 5 184 8192 4.768372e-07 20 \n", + "3 6 184 6553 4.768372e-07 20 \n", + "4 7 352 5461 2.384186e-07 20 \n", + ".. ... ... ... ... ... 
\n", + "169 794672 24117248 1024 1.192093e-06 20 \n", + "170 851708 24117248 1024 1.192093e-06 20 \n", + "171 912838 24117248 1024 2.384186e-07 20 \n", + "172 978356 48234496 1024 7.152557e-07 20 \n", + "173 1048576 48234496 1024 2.384186e-07 20 \n", + "\n", + " bitsPerElement numSlots loadFactor \n", + "0 58.666667 2 0.750000 \n", + "1 46.000000 3 0.500000 \n", + "2 36.800000 3 0.625000 \n", + "3 30.666667 3 0.750000 \n", + "4 50.285714 3 0.875000 \n", + ".. ... ... ... \n", + "169 30.348682 20 0.757858 \n", + "170 28.316334 20 0.812252 \n", + "171 26.420075 20 0.870550 \n", + "172 49.301579 21 0.466516 \n", + "173 46.000000 21 0.500000 \n", + "\n", + "[174 rows x 8 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qf_sizes" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# merged_df = pd.merge(bf_sizes, bf_sizes_less_one, on='inputCardinality')\n", + "# merged_df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Size (bits) vs Input Cardinality')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA+wAAAIoCAYAAADz12GeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeVhc5d3+75mBYWCGZdjCMuxLIAshQIhGMaTRRq1aG9NaazXRvtbWahOtttX21aaLdrVJLS61dWnfusQk2v5aq7UxMWkWEkgIEEiABBJ2CAwDM8PALOf3B5kjw2xn5hwOM+T7uS4unfPczzLP+XLnOZxnkTAMw4AgCIIgCIIgCIIgiIBCOtcNIAiCIAiCIAiCIAjCGXpgJwiCIAiCIAiCIIgAhB7YCYIgCIIgCIIgCCIAoQd2giAIgiAIgiAIgghA6IGdIAiCIAiCIAiCIAIQemAnCIIgCIIgCIIg [...] 
+ "text/plain": [ + "<Figure size 1200x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "SCALE = 1\n", + "ax.plot(bf_sizes[\"inputCardinality\"], bf_sizes[\"Size\"]/SCALE, color=\"blue\", marker=\"o\", label=r\"Bloom: $fpp = 10^{-6}$\")\n", + "\n", + "# Quotient \n", + "ax.plot(qf_sizes[\"inputCardinality\"], qf_sizes[\"Size\"]/SCALE, color=\"black\", marker=\"o\", label=r\"Quotient: $fpp = 10^{-6}$\")\n", + "ax.legend()\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=10)\n", + "ax.set_xscale(\"log\", base=10)\n", + "ax.set_ylabel(\"Size (bits)\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.grid(which='both', linestyle='--', linewidth=0.5)\n", + "ax.set_title(\"Size (bits) vs Input Cardinality\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Size (bits) vs Input Cardinality')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA+wAAAIoCAYAAADz12GeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeVxc1d3/3zPDMjBsQ4AQQhIIhJCdAEnc4x7jVq1W2/pYo9ZqrfvWulRrtW5Vq7Wp2vrUpfWpW9X211qtVWs0JiQhIZAVSCCBEJbAMDAMAzPM/f2Bc8syy52Zy4Uh5/168dKZ8znne+bcL5+cy9xzjk6SJAmBQCAQCAQCgUAgEAgEEwr9eHdAIBAIBAKBQCAQCAQCwWjEDbtAIBAIBAKBQCAQCAQTEHHDLhAIBAKBQCAQCAQCwQRE3LALBAKBQCAQCAQCgUAwARE37AKBQCAQCAQCgUAgEExAxA27 [...] 
+ "text/plain": [ + "<Figure size 1200x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "SCALE = 1\n", + "ax.plot(bf_sizes[\"inputCardinality\"], bf_sizes[\"Size\"]/SCALE, color=\"blue\", label=r\"Bloom: $fpp = 10^{-6}$\")\n", + "ax.plot(qf_sizes[\"inputCardinality\"], qf_sizes[\"Size\"]/SCALE, color=\"black\", label=r\"Quotient: $fpp = 10^{-6}$\", linestyle=\"--\")\n", + "\n", + "ax.legend()\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=10)\n", + "ax.set_xscale(\"log\", base=10)\n", + "ax.set_ylabel(\"Size (bits)\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.grid(which='both', linestyle='--', linewidth=0.5)\n", + "ax.set_title(\"Size (bits) vs Input Cardinality\")\n", + "#fig.savefig(\"image/space-vs-cardinality.pdf\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking closely, we can see that there is a small regime where the Quotient filter space in bits seems to drop beneath that for the Bloom filter. Let's zoom in on that region in the following plots.\n", + "First, we can see that as $n$ increases, the load factor is not constant. The load factor is the ratio of the number of elements in the filter to the filter's length. This behaviour is to be expected since the Quotient filter length must always be a power of two, so once a certain (parameterised) \n", + "threshold is met, then the filter length must be doubled. In this example, we see this behaviour in the vertical drops between about $90\\%$ to $\\approx 45\\%$ load factor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[<matplotlib.axis.XTick at 0x125f1fd10>,\n", + " <matplotlib.axis.XTick at 0x125f5d450>,\n", + " <matplotlib.axis.XTick at 0x125f1f9d0>,\n", + " <matplotlib.axis.XTick at 0x125fe2390>,\n", + " <matplotlib.axis.XTick at 0x125fe8750>,\n", + " <matplotlib.axis.XTick at 0x125feabd0>,\n", + " <matplotlib.axis.XTick at 0x125ff0ed0>,\n", + " <matplotlib.axis.XTick at 0x125ff30d0>,\n", + " <matplotlib.axis.XTick at 0x125ff00d0>,\n", + " <matplotlib.axis.XTick at 0x125ff5e50>,\n", + " <matplotlib.axis.XTick at 0x125ffc210>,\n", + " <matplotlib.axis.XTick at 0x125ffe410>,\n", + " <matplotlib.axis.XTick at 0x126000650>,\n", + " <matplotlib.axis.XTick at 0x125ff4750>,\n", + " <matplotlib.axis.XTick at 0x126003050>,\n", + " <matplotlib.axis.XTick at 0x1260092d0>,\n", + " <matplotlib.axis.XTick at 0x12600b490>,\n", + " <matplotlib.axis.XTick at 0x1260116d0>,\n", + " <matplotlib.axis.XTick at 0x126001a10>,\n", + " <matplotlib.axis.XTick at 0x126018290>,\n", + " <matplotlib.axis.XTick at 0x12601a4d0>]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAIRCAYAAACMHQu3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9eXhdV3nv/zk6tuTIlmzLtmTLR5ad2JnIAKQkECA0JCGFkECgaRhKSggkTIUfuXApNGGm4UKhuYEMDggopVxyS7nctqQtNGUoFJpLRiAhceJRlmzLgyzJ1njO+f2xs/basiVbRzpn77X3+n6ehwdtWTp765v33Wutd73rfXPlcrmMEEIIIYQQQgghhBBCCJFS6pJ+ACGEEEIIIYQQQgghhBBiLijQLYQQQgghhBBCCCGEECLVKNAthBBCCCGEEEIIIYQQItUo0C2EEEIIIYQQQgghhBAi1SjQLYQQ [...] 
+ "text/plain": [ + "<Figure size 1800x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(18, 6))\n", + "ax.plot(qf_sizes[\"inputCardinality\"], qf_sizes[\"loadFactor\"], color=\"black\", marker=\".\", label=r\"Size ratio\")\n", + "\n", + "ax.set_ylabel(\"Load Factor\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.grid()\n", + "ax.set_xticks([2**i for i in range(0, 21)])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAIRCAYAAACMHQu3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydeZgcVbn/P1U9Mz1rz2Sf7CsJIUAWCBGC7BKQiCCKC27wwxXUiMvVq7LI9XJFveJ2Xe81iih65YJKWAQURBDIQlgkBAKBBLKvMz1LT3dV/f7orpruru6equrqpU6dz/P0k+6equ7qb973nFPvec97FMMwDCQSiUQikUgkEolEIpFIJBKJRCIJKGqtL0AikUgkEolEIpFIJBKJRCKRSCSScpCBbolEIpFIJBKJRCKRSCQSiUQikQQaGeiWSCQSiUQikUgkEolEIpFIJBJJoJGBbolEIpFIJBKJRCKR [...] 
+ "text/plain": [ + "<Figure size 1800x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(18, 6))\n", + "ax.plot(bf_sizes[\"inputCardinality\"], bf_sizes[\"Size\"]/qf_sizes[\"Size\"], color=\"grey\", marker=\".\", label=r\"Size ratio\")\n", + "ax.axhline(y=1, color='red', linestyle='--', linewidth=2, label='Bloom filter smaller below this line')\n", + "\n", + "\n", + "\n", + "# Plot vertical lines at 0.9*2^j for each j where we double to length of the quotient filter\n", + "j_values = range(2, 21)\n", + "jval_col = \"magenta\"\n", + "ival_col = \"skyblue\"\n", + "for i, j in enumerate(j_values):\n", + " x = 0.9 * 2**j\n", + " if i == len(j_values) - 1:\n", + " ax.axvline(x=x, color=jval_col, linestyle=':', alpha=0.5, label=r\"$0.9 \\cdot 2^j$\")\n", + " else:\n", + " ax.axvline(x=x, color=jval_col, linestyle=':', alpha=0.5)\n", + "\n", + "\n", + "ax.legend()\n", + "ax.grid()\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.set_ylabel(\"Size Ratio\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.grid(which='both', linestyle='--', linewidth=0.5)\n", + "#fig.savefig(\"image/bloom-quotient-ratio.pdf\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that _for a fixed false positive rate_, the Bloom filter is smaller than the Quotient filter in most regimes, unless the Quotient filter close to, but not exceeding $90\\%$.\n", + "\n", + "### Experiment 1a.\n", + "\n", + "We can also do a rough analysis of the false positive rate over input cardinalities (note that a more refined analysis will follow).\n", + "In this plot we see that the Bloom filter probability approximations do not work well in the small cardinality regime and then approach the $2^{-f}$ bound for large cardinalities. On the other hand, the Quotient filter error behaviour consistently remains below the $2^{-f}$ probability error bound. 
There are a small number of exceptions, but these are deferred for later analysis." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Measured False Positive Rate vs. Input Cardinality (zeros occur at non-uniform points)')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA/QAAAIoCAYAAADHrqCCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd3hUVf6435lJmcykd0ICgYQSQgkkhFVRQJoo2MVusC12kcVd9auyloVdbFgQ2yquq7uu3bVjRZEWIPQaQgglJCHJJJOemfP7g9/czaSQAW6YOfG8z5MHcufMve/9zLkn98w953MMQgiBQqFQKBQKhUKhUCgUCqkweltAoVAoFAqFQqFQKBQKxfGjOvQKhUKhUCgUCoVCoVBIiOrQKxQKhUKhUCgUCoVCISGqQ69QKBQKhUKhUCgUCoWEqA69QqFQKBQKhUKhUCgUEqI69AqFQqFQKBQKhUKhUEiI [...] + "text/plain": [ + "<Figure size 1200x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "ax.plot(bf_sizes[\"inputCardinality\"], bf_sizes[\"FalsePositiveRate\"], color=\"blue\", marker=\"o\", label=r\"Bloom target $fpp = 10^{-6}$\")\n", + "\n", + "# Quotient \n", + "ax.plot(qf_sizes[\"inputCardinality\"], qf_sizes[\"FalsePositiveRate\"], color=\"black\", label=r\"Quotient target $fpp = 10^{-6}$\")\n", + "ax.plot(qf_sizes[\"inputCardinality\"][::3], qf_sizes[\"FalsePositiveRate\"][::3], color=\"black\", marker=\"o\", linestyle=\"None\") # just for markers ::3 aribitrarily chosen.\n", + "\n", + "xvals = np.array([2**j for j in range(21)])\n", + "ax.plot(xvals, (1e-6)*np.ones_like(xvals), label=\"Target FPP = $10^{-6}$\", color=\"red\", linestyle=\"--\")\n", + "ax.legend()\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=10)\n", + "ax.set_ylim(1e-7, 1)\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.set_xticks([2**i for i in range(0, 21)])\n", + 
"ax.set_ylabel(\"FPR (measured)\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.grid(which='both', linestyle='--', linewidth=0.5)\n", + "ax.set_title(\"Measured False Positive Rate vs. Input Cardinality (zeros occur at non-uniform points)\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiment 2. Changing the false positive rate\n", + "\n", + "We repeat the above with a larger false positive rate. The main \n", + "takeaway is that the Quotient filter is less attractive in this setting, consistent with prior plots." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "bf_sizes_1e3 = parse_size_results(\"1e-3/BloomFilterSizeProfile20240703_045451PST.txt\")\n", + "qf_sizes_1e3 = parse_size_results(\"1e-3/QuotientFilterSizeProfile20240703_045733PST.txt\")\n", + "\n", + "for df in [bf_sizes_1e3, qf_sizes_1e3]:\n", + " df[\"bitsPerElement\"] = df[\"Size\"] / df[\"inputCardinality\"]\n", + "\n", + "for qf in [qf_sizes_1e3]:\n", + " qf[\"numSlots\"] = np.ceil(np.log2(qf[\"inputCardinality\"] / 0.9 )).astype(int) # nb this is because the suggestion function always uses 0.9 and may change in futuer.\n", + " qf[\"loadFactor\"] = qf[\"inputCardinality\"] / (2**qf[\"numSlots\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>inputCardinality</th>\n", + " <th>Size</th>\n", + " <th>NumTrials</th>\n", + " <th>FalsePositiveRate</th>\n", + " <th>NumHashBits</th>\n", + " 
<th>bitsPerElement</th>\n", + " <th>numSlots</th>\n", + " <th>loadFactor</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>3</td>\n", + " <td>96</td>\n", + " <td>16384</td>\n", + " <td>0.000244</td>\n", + " <td>10</td>\n", + " <td>32.000000</td>\n", + " <td>2</td>\n", + " <td>0.750000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>104</td>\n", + " <td>10922</td>\n", + " <td>0.000732</td>\n", + " <td>10</td>\n", + " <td>26.000000</td>\n", + " <td>3</td>\n", + " <td>0.500000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>5</td>\n", + " <td>104</td>\n", + " <td>8192</td>\n", + " <td>0.000488</td>\n", + " <td>10</td>\n", + " <td>20.800000</td>\n", + " <td>3</td>\n", + " <td>0.625000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>6</td>\n", + " <td>104</td>\n", + " <td>6553</td>\n", + " <td>0.000244</td>\n", + " <td>10</td>\n", + " <td>17.333333</td>\n", + " <td>3</td>\n", + " <td>0.750000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>7</td>\n", + " <td>192</td>\n", + " <td>5461</td>\n", + " <td>0.000732</td>\n", + " <td>10</td>\n", + " <td>27.428571</td>\n", + " <td>3</td>\n", + " <td>0.875000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>169</th>\n", + " <td>794672</td>\n", + " <td>13631488</td>\n", + " <td>1024</td>\n", + " <td>0.000000</td>\n", + " <td>10</td>\n", + " <td>17.153603</td>\n", + " <td>20</td>\n", + " <td>0.757858</td>\n", + " </tr>\n", + " <tr>\n", + " <th>170</th>\n", + " <td>851708</td>\n", + " <td>13631488</td>\n", + " <td>1024</td>\n", + " <td>0.000244</td>\n", + " <td>10</td>\n", + " <td>16.004884</td>\n", + " <td>20</td>\n", + " <td>0.812252</td>\n", + " </tr>\n", + " <tr>\n", + " <th>171</th>\n", + " 
<td>912838</td>\n", + " <td>13631488</td>\n", + " <td>1024</td>\n", + " <td>0.000977</td>\n", + " <td>10</td>\n", + " <td>14.933086</td>\n", + " <td>20</td>\n", + " <td>0.870550</td>\n", + " </tr>\n", + " <tr>\n", + " <th>172</th>\n", + " <td>978356</td>\n", + " <td>27262976</td>\n", + " <td>1024</td>\n", + " <td>0.000244</td>\n", + " <td>10</td>\n", + " <td>27.866110</td>\n", + " <td>21</td>\n", + " <td>0.466516</td>\n", + " </tr>\n", + " <tr>\n", + " <th>173</th>\n", + " <td>1048576</td>\n", + " <td>27262976</td>\n", + " <td>1024</td>\n", + " <td>0.000000</td>\n", + " <td>10</td>\n", + " <td>26.000000</td>\n", + " <td>21</td>\n", + " <td>0.500000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>174 rows × 8 columns</p>\n", + "</div>" + ], + "text/plain": [ + " inputCardinality Size NumTrials FalsePositiveRate NumHashBits \\\n", + "0 3 96 16384 0.000244 10 \n", + "1 4 104 10922 0.000732 10 \n", + "2 5 104 8192 0.000488 10 \n", + "3 6 104 6553 0.000244 10 \n", + "4 7 192 5461 0.000732 10 \n", + ".. ... ... ... ... ... \n", + "169 794672 13631488 1024 0.000000 10 \n", + "170 851708 13631488 1024 0.000244 10 \n", + "171 912838 13631488 1024 0.000977 10 \n", + "172 978356 27262976 1024 0.000244 10 \n", + "173 1048576 27262976 1024 0.000000 10 \n", + "\n", + " bitsPerElement numSlots loadFactor \n", + "0 32.000000 2 0.750000 \n", + "1 26.000000 3 0.500000 \n", + "2 20.800000 3 0.625000 \n", + "3 17.333333 3 0.750000 \n", + "4 27.428571 3 0.875000 \n", + ".. ... ... ... 
\n", + "169 17.153603 20 0.757858 \n", + "170 16.004884 20 0.812252 \n", + "171 14.933086 20 0.870550 \n", + "172 27.866110 21 0.466516 \n", + "173 26.000000 21 0.500000 \n", + "\n", + "[174 rows x 8 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qf_sizes_1e3" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>inputCardinality</th>\n", + " <th>Size</th>\n", + " <th>NumTrials</th>\n", + " <th>FalsePositiveRate</th>\n", + " <th>NumHashBits</th>\n", + " <th>bitsPerElement</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>3</td>\n", + " <td>64</td>\n", + " <td>16384</td>\n", + " <td>0.006104</td>\n", + " <td>11</td>\n", + " <td>21.333333</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>4</td>\n", + " <td>64</td>\n", + " <td>10922</td>\n", + " <td>0.016968</td>\n", + " <td>11</td>\n", + " <td>16.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>5</td>\n", + " <td>128</td>\n", + " <td>8192</td>\n", + " <td>0.003174</td>\n", + " <td>10</td>\n", + " <td>25.600000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>6</td>\n", + " <td>128</td>\n", + " <td>6553</td>\n", + " <td>0.002930</td>\n", + " <td>11</td>\n", + " <td>21.333333</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>7</td>\n", + " <td>128</td>\n", + " <td>5461</td>\n", + " <td>0.002808</td>\n", + " <td>11</td>\n", + " <td>18.285714</td>\n", 
+ " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>169</th>\n", + " <td>794672</td>\n", + " <td>11425472</td>\n", + " <td>1024</td>\n", + " <td>0.000488</td>\n", + " <td>10</td>\n", + " <td>14.377595</td>\n", + " </tr>\n", + " <tr>\n", + " <th>170</th>\n", + " <td>851708</td>\n", + " <td>12245568</td>\n", + " <td>1024</td>\n", + " <td>0.001465</td>\n", + " <td>10</td>\n", + " <td>14.377660</td>\n", + " </tr>\n", + " <tr>\n", + " <th>171</th>\n", + " <td>912838</td>\n", + " <td>13124416</td>\n", + " <td>1024</td>\n", + " <td>0.001465</td>\n", + " <td>10</td>\n", + " <td>14.377596</td>\n", + " </tr>\n", + " <tr>\n", + " <th>172</th>\n", + " <td>978356</td>\n", + " <td>14066432</td>\n", + " <td>1024</td>\n", + " <td>0.001709</td>\n", + " <td>10</td>\n", + " <td>14.377621</td>\n", + " </tr>\n", + " <tr>\n", + " <th>173</th>\n", + " <td>1048576</td>\n", + " <td>15076032</td>\n", + " <td>1024</td>\n", + " <td>0.000732</td>\n", + " <td>10</td>\n", + " <td>14.377625</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>174 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " inputCardinality Size NumTrials FalsePositiveRate NumHashBits \\\n", + "0 3 64 16384 0.006104 11 \n", + "1 4 64 10922 0.016968 11 \n", + "2 5 128 8192 0.003174 10 \n", + "3 6 128 6553 0.002930 11 \n", + "4 7 128 5461 0.002808 11 \n", + ".. ... ... ... ... ... \n", + "169 794672 11425472 1024 0.000488 10 \n", + "170 851708 12245568 1024 0.001465 10 \n", + "171 912838 13124416 1024 0.001465 10 \n", + "172 978356 14066432 1024 0.001709 10 \n", + "173 1048576 15076032 1024 0.000732 10 \n", + "\n", + " bitsPerElement \n", + "0 21.333333 \n", + "1 16.000000 \n", + "2 25.600000 \n", + "3 21.333333 \n", + "4 18.285714 \n", + ".. ... 
\n", + "169 14.377595 \n", + "170 14.377660 \n", + "171 14.377596 \n", + "172 14.377621 \n", + "173 14.377625 \n", + "\n", + "[174 rows x 6 columns]" + ] + }, + "execution_count": 252, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bf_sizes_1e3" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Size (bits) vs Input Cardinality')" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA+wAAAIoCAYAAADz12GeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9eXwb1b3+/0jyIlvyvtvyFtvZ7Tix40BCiEOAAC1QQoALlCa0pZQ1gV/pve1tWbrALfdemrQ10NIl0BK2JC18Wy4UQtKkJHFiJ16SOPESJ/Fux5ZlS7IkS5rfH0ZTyxpJI814rHE+79fLL5DOc5Y58/GTOZ6zKBiGYUAQBEEQBEEQBEEQREihnOkGEARBEARBEARBEAThCQ3YCYIgCIIgCIIgCCIEoQE7QRAEQRAEQRAEQYQgNGAnCIIgCIIgCIIgiBCEBuwEQRAEQRAEQRAEEYLQgJ0gCIIgCIIg [...] 
+ "text/plain": [ + "<Figure size 1200x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "SCALE = 1\n", + "ax.plot(bf_sizes_1e3[\"inputCardinality\"], bf_sizes_1e3[\"Size\"]/SCALE, color=\"blue\", marker=\"o\", label=r\"Bloom: $fpp = 10^{-3}$\")\n", + "\n", + "# Quotient \n", + "ax.plot(qf_sizes_1e3[\"inputCardinality\"], qf_sizes_1e3[\"Size\"]/SCALE, color=\"black\", marker=\"o\", label=r\"Quotient: $fpp = 10^{-3}$\")\n", + "\n", + "\n", + "ax.legend()\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=10)\n", + "ax.set_xscale(\"log\", base=10)\n", + "ax.set_ylabel(\"Size (bits)\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.grid(which='both', linestyle='--', linewidth=0.5)\n", + "ax.set_title(\"Size (bits) vs Input Cardinality\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Size ratio of Bloom Filter to Quotient Filter')" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABboAAAIoCAYAAACvaF3SAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3wUdf4/8NfuppdNCKlASKjSexEsoKIcCipn9zwBFRAVRE79yp2ncBbu8FQ45bCcCraf9SyHiGJBEZUOShUwoRhqKulb5vdHMsNOPrvJzmT7vJ6PRx5sNrs7kxfv98zks7OfMUmSJIGIiIiIiIiIiIiIKEyZg70CREREREREREREREStwYFuIiIiIiIiIiIiIgprHOgmIiIiIiIiIiIiorDGgW4iIiIiIiIiIiIiCmsc6CYiIiIiIiIiIiKisMaBbiIiIiIiIiIiIiIKaxzoJiIiIiIiIiIiIqKw [...] 
+ "text/plain": [ + "<Figure size 1800x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(18, 6))\n", + "ax.plot(bf_sizes_1e3[\"inputCardinality\"], bf_sizes_1e3[\"Size\"]/qf_sizes_1e3[\"Size\"], color=\"grey\", marker=\".\", label=r\"Size ratio (fpp <= $10^{-3}$)\")\n", + "ax.axhline(y=1, color='red', linestyle='--', linewidth=2, label='Bloom filter smaller below this line')\n", + "\n", + "# Plot vertical lines at 0.9*2^j for each j\n", + "j_values = range(2, 21)\n", + "jval_col = \"magenta\"\n", + "for i, j in enumerate(j_values):\n", + " x = 0.9 * 2**j\n", + " if i == len(j_values) - 1:\n", + " ax.axvline(x=x, color=jval_col, linestyle=':', alpha=0.5, label=r\"$0.9 \\cdot 2^j$\")\n", + " else:\n", + " ax.axvline(x=x, color=jval_col, linestyle=':', alpha=0.5)\n", + "\n", + "\n", + "ax.legend()\n", + "ax.grid()\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.set_ylabel(\"Size Ratio\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.grid(which='both', linestyle='--', linewidth=0.5)\n", + "ax.set_title(\"Size ratio of Bloom Filter to Quotient Filter\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Measured False Positive Rate vs. Input Cardinality (zeros occur at non-uniform points)')" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA/QAAAIoCAYAAADHrqCCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOydd3hUVd6A35lJmcykd0ICCQkllCSQEFZBQQEREcWGXbAtdpHFXXRVrLBrxVXEtpa1fa4NXQt2RVEDAUKvIYTQQkmdSZ853x/ZuZtJIRO4YebgeZ8nD+TOmXvf+5tzT+6Ze87vGIQQAoVCoVAoFAqFQqFQKBRSYfS2gEKhUCgUCoVCoVAoFIquozr0CoVCoVAoFAqFQqFQSIjq0CsUCoVCoVAoFAqFQiEhqkOvUCgUCoVCoVAoFAqFhKgOvUKhUCgUCoVCoVAoFBKiOvQKhUKhUCgUCoVCoVBIiOrQ [...] 
+ "text/plain": [ + "<Figure size 1200x600 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "ax.plot(bf_sizes_1e3[\"inputCardinality\"], bf_sizes_1e3[\"FalsePositiveRate\"], color=\"blue\", marker=\"o\", label=r\"Bloom target $fpp = 10^{-3}$\")\n", + "\n", + "# Quotient \n", + "ax.plot(qf_sizes_1e3[\"inputCardinality\"], qf_sizes_1e3[\"FalsePositiveRate\"], color=\"black\", label=r\"Quotient target $fpp = 10^{-3}$\")\n", + "ax.plot(qf_sizes_1e3[\"inputCardinality\"][::3], qf_sizes_1e3[\"FalsePositiveRate\"][::3], color=\"black\", marker=\"o\", linestyle=\"None\") \n", + "\n", + "xvals = np.array([2**j for j in range(21)])\n", + "ax.plot(xvals, (1e-3)*np.ones_like(xvals), label=\"Target FPP = $10^{-e}$\", color=\"red\", linestyle=\"--\")\n", + "ax.legend()\n", + "ax.grid()\n", + "ax.set_yscale(\"log\", base=10)\n", + "ax.set_ylim(1e-7, 1)\n", + "ax.set_xscale(\"log\", base=2)\n", + "ax.set_xticks([2**i for i in range(0, 21)])\n", + "ax.set_ylabel(\"FPR (measured)\")\n", + "ax.set_xlabel(\"Input Cardinality\")\n", + "ax.grid(which='both', linestyle='--', linewidth=0.5)\n", + "ax.set_title(\"Measured False Positive Rate vs. Input Cardinality (zeros occur at non-uniform points)\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From these plots we see roughly the same error behaviour as when the fpr is $1e-6$, however the gaps between the two filters have reduced. This supports the theoretical understanding that if higher accuracy is desired then Quotient filter should be more strongly considered compared to at lower false positive rates." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "density-venv", + "language": "python", + "name": "density_venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/notebooks/qf_probability_sum.py b/src/notebooks/qf_probability_sum.py new file mode 100644 index 0000000..0f8ccf9 --- /dev/null +++ b/src/notebooks/qf_probability_sum.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import numpy as np +from scipy.stats import binom as binom_dist + +def qf_probability_sum(num_slots:int|np.int64, num_elements:int|np.int64, fingerprint_length:int|np.int64) -> float: + """ + This sum is well approximated by alpha*2^(-f) for alpha = num_elements / num_slots + :param num_slots: The number of slots in the filter + :param num_elements: The number of elements to be inserted into the filter + :param fingerprint_length: The length of the fingerprint used in the filter + :return: The sum of probabilities + """ + m = num_slots + n = num_elements + f = fingerprint_length + assert isinstance(m, int) | isinstance(f, np.int64), "num_slots must be an integer" + assert isinstance(n, int) | isinstance(f, np.int64), "num_elements must be an integer" + assert isinstance(f, int) | isinstance(f, np.int64), "fingerprint_length must be an integer" + dist = binom_dist(n, 1./m) + probs = dist.pmf(np.arange(m+1)) + probs *= (1. - (1. - 2.**(-f))**np.arange(m+1)) + return np.sum(probs) \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
