{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Library/Python/2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", " \"You should import from traitlets.config instead.\", ShimWarning)\n", "/Library/Python/2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" ] }, { "data": { "text/plain": [ "[]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%load_ext sql\n", "%sql sqlite:///complaint.db\n", "import time\n", "%sql drop index if exists helpful_index;\n", "%sql analyze" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Let's play with the [consumer complaint database](https://catalog.data.gov/dataset/consumer-complaint-database) from data.gov" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", "
count(*)
818801
" ], "text/plain": [ "[(818801,)]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select count(*) from complaints;" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Complaint_IDProductSubproductIssueSubissueStateZIP_codeSubmitted_viaDate_receivedDate_sent_to_companyCompanyCompany_responseTimely_responseConsumer_disputed
NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNone
1431865Consumer loanVehicle loanManaging the loan or leaseNJ08736Web06/22/201506/22/2015Nissan Motor Acceptance CorporationIn progressYes
1431374Debt collectionMedicalDisclosure verification of debtNot given enough info to verify debtWI54140Web06/22/201506/22/2015RSI Enterprises, Inc.Closed with explanationYes
1431251MortgageConventional fixed mortgageLoan modification,collection,foreclosureMO63368Web06/22/201506/22/2015Flagstar BankIn progressYes
1431743Debt collectionMedicalCont'd attempts collect debt not owedDebt is not mineWA98055Web06/22/201506/22/2015P.S.C., IncClosed with explanationYesYes
" ], "text/plain": [ "[(None, None, None, None, None, None, None, None, None, None, None, None, None, None),\n", " (u'1431865', u'Consumer loan', u'Vehicle loan', u'Managing the loan or lease', u'', u'NJ', u'08736', u'Web', u'06/22/2015', u'06/22/2015', u'Nissan Motor Acceptance Corporation', u'In progress', u'Yes', u''),\n", " (u'1431374', u'Debt collection', u'Medical', u'Disclosure verification of debt', u'Not given enough info to verify debt', u'WI', u'54140', u'Web', u'06/22/2015', u'06/22/2015', u'RSI Enterprises, Inc.', u'Closed with explanation', u'Yes', u''),\n", " (u'1431251', u'Mortgage', u'Conventional fixed mortgage', u'Loan modification,collection,foreclosure', u'', u'MO', u'63368', u'Web', u'06/22/2015', u'06/22/2015', u'Flagstar Bank', u'In progress', u'Yes', u''),\n", " (u'1431743', u'Debt collection', u'Medical', u\"Cont'd attempts collect debt not owed\", u'Debt is not mine', u'WA', u'98055', u'Web', u'06/22/2015', u'06/22/2015', u'P.S.C., Inc', u'Closed with explanation', u'Yes', u'Yes')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql select * from complaints limit 5;" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "The syntax to create a view is below\n", "> create index (index_name) on (table)(attributes)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n" ] }, { "data": { "text/plain": [ "[]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql drop index if exists zip_index;\n", "create index zip_index on complaints(zip_code);" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ZIP_codec
9234
483821842
330711242
33173774
76116742
" ], "text/plain": [ "[(u'', 9234),\n", " (u'48382', 1842),\n", " (u'33071', 1242),\n", " (u'33173', 774),\n", " (u'76116', 742)]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "select zip_code, count(*) as c \n", "from complaints group by zip_code \n", "order by c desc limit 5;" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Productcount(*)
Bank account or service3
Credit card3
Credit reporting4
Debt collection2
Mortgage908
Student loan1
" ], "text/plain": [ "[(u'Bank account or service', 3),\n", " (u'Credit card', 3),\n", " (u'Credit reporting', 4),\n", " (u'Debt collection', 2),\n", " (u'Mortgage', 908),\n", " (u'Student loan', 1)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql select product, count(*)\n", "from complaints c1 where ZIP_CODE = '48382'\n", "group by product;" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Let's get down to business... Indexing complainers!\n", "\n", "For pedagogical purposes, we'll use _likelihood_ function:\n", " * likelihood(X,Y) does not change X's value \n", " * instead it tells the DBMS the fraction of calls on which we believe X holds.\n", " * 0.0 is lowest, 1.0 is highest.\n", " * It's a form of _hint_ for the optimizer" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
selectidorderfromdetail
000SCAN TABLE complaints AS c1
000USE TEMP B-TREE FOR GROUP BY
" ], "text/plain": [ "[(0, 0, 0, u'SCAN TABLE complaints AS c1'),\n", " (0, 0, 0, u'USE TEMP B-TREE FOR GROUP BY')]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql \n", "explain query plan \n", "select product, count(*) from complaints c1 \n", "where likelihood(ZIP_CODE = '48382', 1.0) group by product;" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
selectidorderfromdetail
000SEARCH TABLE complaints AS c1 USING INDEX zip_index (ZIP_code=?)
000USE TEMP B-TREE FOR GROUP BY
" ], "text/plain": [ "[(0, 0, 0, u'SEARCH TABLE complaints AS c1 USING INDEX zip_index (ZIP_code=?)'),\n", " (0, 0, 0, u'USE TEMP B-TREE FOR GROUP BY')]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql \n", "explain query plan \n", "select product, count(*) from complaints c1 \n", "where likelihood(ZIP_CODE = '48382', 0.0) group by product;" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "So... does it make a difference in execution time? \n", "\n", "... Well, some it's a small database in memory..." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n", "time for scan=0.00245881080627 time for index=0.00234699249268\n", "\t So about 1.04764323446 times faster\n" ] } ], "source": [ "start_scan = time.time()\n", "%sql select product, count(*) from complaints c1 where likelihood(ZIP_CODE = '48382', 1.0) group by product;\n", "scan_time = time.time() - start_scan\n", "\n", "start_index = time.time()\n", "%sql select product, count(*) from complaints c1 where likelihood(ZIP_CODE = '48382', 0.0) group by product;\n", "index_time = time.time() - start_index\n", "print(\"time for scan={0} time for index={1}\".format(scan_time, index_time))\n", "print(\"\\t So about {0} times faster\".format(scan_time/index_time))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "
Covering Indexes
" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "We say that an index is _covering_ if all the needed columns are in the index.\n", " * Needed columns is the union of those in WHERE _and_ SELECT.\n", " * Means that the query can be answere using the index--without reading the table!" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n", "Done.\n" ] }, { "data": { "text/plain": [ "[]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql drop index if exists helpful_index;\n", "create index helpful_index on \n", "complaints(zip_code, product, complaint_id);" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Which plan happens for this query?\n", "> select product, count(*)
\n", "> from complaints
\n", "> where likelihood(ZIP_CODE = '48382', 1.0)\n", ">
group by product;\n", "\n", "NB: Using zip_index, this would be a scan!" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false, "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
selectidorderfromdetail
000SEARCH TABLE complaints AS c1 USING COVERING INDEX helpful_index (ZIP_code=?)
" ], "text/plain": [ "[(0, 0, 0, u'SEARCH TABLE complaints AS c1 USING COVERING INDEX helpful_index (ZIP_code=?)')]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%sql\n", "explain query plan select product, count(*) \n", "from complaints c1 \n", "where likelihood(ZIP_CODE = '48382', 1.0) group by product;" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "* The index is _smaller_ in absolute data size.\n", "* The DBMS knows this, and it picks the right index!" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "Summary\n", "-------\n", "\n", "* This notebook walks through index creation \n", "* That index selection is done in a cost based way.\n", "* Introduced the notion of covering index.\n", "* Demonstrated that not only keys to answer query, but attributes returned important component of selection. " ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }