add two ver for blog writing

chychen · chychen · commit 50141c636e07 · 2020-08-27T14:09:35.000Z
diff --git a/cpu_based_version.ipynb b/cpu_based_version.ipynb
@@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download Example Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ! FILEID=\"1OO0tUguZMyQ1d37K7F9jiwV7mm_z2yuD\" && FILENAME=\"example_data.npy\" && wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id='$FILEID -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id=$FILEID\" -O $FILENAME && rm -rf /tmp/cookies.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import Numba CUDA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import cuda\n",
+    "import numpy as np\n",
+    "import math"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = np.load('example_data.npy')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Original (CPU-based)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def ridge_detection(f, thres):\n",
+    "    count = np.zeros(f.shape)\n",
+    "    for i in range(len(f)):\n",
+    "        for j in range(len(f[i])):\n",
+    "            if (\n",
+    "                i > 0\n",
+    "                and j > 0\n",
+    "                and i < (len(f) - 1)\n",
+    "                and j < (len(f[i]) - 1)\n",
+    "                and f[i, j] > thres\n",
+    "                and ~np.isnan(f[i, j])\n",
+    "            ):\n",
+    "                step_i = i\n",
+    "                step_j = j\n",
+    "                for k in range(1000):\n",
+    "                    if (\n",
+    "                        step_i == 0\n",
+    "                        or step_j == 0\n",
+    "                        or step_i == (len(f) - 1)\n",
+    "                        or step_j == (len(f[i]) - 1)\n",
+    "                    ):\n",
+    "                        break\n",
+    "                    index = np.nanargmax(\n",
+    "                        f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
+    "                    )\n",
+    "                    vmax = np.nanmax(\n",
+    "                        f[step_i - 1 : step_i + 2, step_j - 1 : step_j + 2].data\n",
+    "                    )\n",
+    "                    if index == 4 or vmax == f[step_i, step_j] or np.isnan(vmax):\n",
+    "                        break\n",
+    "                    row = int(index / 3)\n",
+    "                    col = index % 3\n",
+    "                    count[step_i - 1 + row, step_j - 1 + col] += 1\n",
+    "                    step_i = step_i - 1 + row\n",
+    "                    step_j = step_j - 1 + col\n",
+    "    return count"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 6min 14s, sys: 15.9 s, total: 6min 30s\n",
+      "Wall time: 6min 6s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "results = ridge_detection(data, 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %timeit -r 7 -n 1 ridge_detection(data, 0)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/cuda_python_version.ipynb b/cuda_python_version.ipynb
@@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Download Example Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ! FILEID=\"1OO0tUguZMyQ1d37K7F9jiwV7mm_z2yuD\" && FILENAME=\"example_data.npy\" && wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id='$FILEID -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id=$FILEID\" -O $FILENAME && rm -rf /tmp/cookies.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import Numba CUDA"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from numba import cuda\n",
+    "import numpy as np\n",
+    "import math"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = np.load('example_data.npy')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Numba (CUDA Python)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@cuda.jit\n",
+    "def cuda_ridge_detection(f, count, thres):\n",
+    "    start_i, start_j = cuda.grid(2)\n",
+    "    stride_i, stride_j = cuda.gridsize(2)\n",
+    "    for i in range(start_i, f.shape[0], stride_i):\n",
+    "        for j in range(start_j, f.shape[1], stride_j):\n",
+    "            if (\n",
+    "                i > 0\n",
+    "                and j > 0\n",
+    "                and i < (f.shape[0] - 1)\n",
+    "                and j < (f.shape[1] - 1)\n",
+    "                and f[i, j] > thres\n",
+    "                and ~math.isnan(f[i, j])\n",
+    "            ):\n",
+    "                step_i = i\n",
+    "                step_j = j\n",
+    "                for k in range(1000):\n",
+    "                    if (\n",
+    "                        step_i == 0\n",
+    "                        or step_j == 0\n",
+    "                        or step_i == (f.shape[0] - 1)\n",
+    "                        or step_j == (f.shape[1] - 1)\n",
+    "                    ):\n",
+    "                        break\n",
+    "                    index = 4\n",
+    "                    vmax = -np.inf\n",
+    "                    for ii in range(3):\n",
+    "                        for jj in range(3):\n",
+    "                            if f[step_i + ii - 1, step_j + jj - 1] > vmax:\n",
+    "                                vmax = f[step_i + ii - 1, step_j + jj - 1]\n",
+    "                                index = jj + 3 * ii\n",
+    "                    if index == 4 or vmax == f[step_i, step_j] or math.isnan(vmax):\n",
+    "                        break\n",
+    "                    row = int(index / 3)\n",
+    "                    col = index % 3\n",
+    "                    cuda.atomic.add(count, (step_i - 1 + row, step_j - 1 + col), 1)\n",
+    "                    step_i = step_i - 1 + row\n",
+    "                    step_j = step_j - 1 + col"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def test_func(data):\n",
+    "    device_data = cuda.to_device(data)\n",
+    "    device_results = cuda.device_array_like(device_data)\n",
+    "    cuda_ridge_detection[(8, 8), (8, 32)](device_data, device_results, 0)\n",
+    "    cuda_results = device_results.copy_to_host()\n",
+    "    return cuda_results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cuda_results = test_func(data)\n",
+    "np.testing.assert_almost_equal(results, cuda_results)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.67 ms ± 8.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit -r 7 -n 1000 test_func(data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Speedup by 200,000+ times!!!!!\n",
+    "\n",
+    "- CPU-based solution cost 366 seconds (366000 ms)\n",
+    "- CUDA Python solution cost 0.00167 seconds (1.67 ms)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "219161.6766467066"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "366000 / 1.67"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}