{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Library/Python/2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n",
" \"You should import from traitlets.config instead.\", ShimWarning)\n",
"/Library/Python/2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n",
" warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%load_ext sql\n",
"%sql sqlite:///complaint.db\n",
"import time\n",
"%sql drop index if exists helpful_index;\n",
"%sql analyze"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Let's play with the [consumer complaint database](https://catalog.data.gov/dataset/consumer-complaint-database) from data.gov"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"
\n",
" \n",
" count(*) | \n",
"
\n",
" \n",
" 818801 | \n",
"
\n",
"
"
],
"text/plain": [
"[(818801,)]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select count(*) from complaints;"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" Complaint_ID | \n",
" Product | \n",
" Subproduct | \n",
" Issue | \n",
" Subissue | \n",
" State | \n",
" ZIP_code | \n",
" Submitted_via | \n",
" Date_received | \n",
" Date_sent_to_company | \n",
" Company | \n",
" Company_response | \n",
" Timely_response | \n",
" Consumer_disputed | \n",
"
\n",
" \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
"
\n",
" \n",
" 1431865 | \n",
" Consumer loan | \n",
" Vehicle loan | \n",
" Managing the loan or lease | \n",
" | \n",
" NJ | \n",
" 08736 | \n",
" Web | \n",
" 06/22/2015 | \n",
" 06/22/2015 | \n",
" Nissan Motor Acceptance Corporation | \n",
" In progress | \n",
" Yes | \n",
" | \n",
"
\n",
" \n",
" 1431374 | \n",
" Debt collection | \n",
" Medical | \n",
" Disclosure verification of debt | \n",
" Not given enough info to verify debt | \n",
" WI | \n",
" 54140 | \n",
" Web | \n",
" 06/22/2015 | \n",
" 06/22/2015 | \n",
" RSI Enterprises, Inc. | \n",
" Closed with explanation | \n",
" Yes | \n",
" | \n",
"
\n",
" \n",
" 1431251 | \n",
" Mortgage | \n",
" Conventional fixed mortgage | \n",
" Loan modification,collection,foreclosure | \n",
" | \n",
" MO | \n",
" 63368 | \n",
" Web | \n",
" 06/22/2015 | \n",
" 06/22/2015 | \n",
" Flagstar Bank | \n",
" In progress | \n",
" Yes | \n",
" | \n",
"
\n",
" \n",
" 1431743 | \n",
" Debt collection | \n",
" Medical | \n",
" Cont'd attempts collect debt not owed | \n",
" Debt is not mine | \n",
" WA | \n",
" 98055 | \n",
" Web | \n",
" 06/22/2015 | \n",
" 06/22/2015 | \n",
" P.S.C., Inc | \n",
" Closed with explanation | \n",
" Yes | \n",
" Yes | \n",
"
\n",
"
"
],
"text/plain": [
"[(None, None, None, None, None, None, None, None, None, None, None, None, None, None),\n",
" (u'1431865', u'Consumer loan', u'Vehicle loan', u'Managing the loan or lease', u'', u'NJ', u'08736', u'Web', u'06/22/2015', u'06/22/2015', u'Nissan Motor Acceptance Corporation', u'In progress', u'Yes', u''),\n",
" (u'1431374', u'Debt collection', u'Medical', u'Disclosure verification of debt', u'Not given enough info to verify debt', u'WI', u'54140', u'Web', u'06/22/2015', u'06/22/2015', u'RSI Enterprises, Inc.', u'Closed with explanation', u'Yes', u''),\n",
" (u'1431251', u'Mortgage', u'Conventional fixed mortgage', u'Loan modification,collection,foreclosure', u'', u'MO', u'63368', u'Web', u'06/22/2015', u'06/22/2015', u'Flagstar Bank', u'In progress', u'Yes', u''),\n",
" (u'1431743', u'Debt collection', u'Medical', u\"Cont'd attempts collect debt not owed\", u'Debt is not mine', u'WA', u'98055', u'Web', u'06/22/2015', u'06/22/2015', u'P.S.C., Inc', u'Closed with explanation', u'Yes', u'Yes')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql select * from complaints limit 5;"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"The syntax to create a view is below\n",
"> create index (index_name) on (table)(attributes)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "fragment"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql drop index if exists zip_index;\n",
"create index zip_index on complaints(zip_code);"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" ZIP_code | \n",
" c | \n",
"
\n",
" \n",
" | \n",
" 9234 | \n",
"
\n",
" \n",
" 48382 | \n",
" 1842 | \n",
"
\n",
" \n",
" 33071 | \n",
" 1242 | \n",
"
\n",
" \n",
" 33173 | \n",
" 774 | \n",
"
\n",
" \n",
" 76116 | \n",
" 742 | \n",
"
\n",
"
"
],
"text/plain": [
"[(u'', 9234),\n",
" (u'48382', 1842),\n",
" (u'33071', 1242),\n",
" (u'33173', 774),\n",
" (u'76116', 742)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"select zip_code, count(*) as c \n",
"from complaints group by zip_code \n",
"order by c desc limit 5;"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" Product | \n",
" count(*) | \n",
"
\n",
" \n",
" Bank account or service | \n",
" 3 | \n",
"
\n",
" \n",
" Credit card | \n",
" 3 | \n",
"
\n",
" \n",
" Credit reporting | \n",
" 4 | \n",
"
\n",
" \n",
" Debt collection | \n",
" 2 | \n",
"
\n",
" \n",
" Mortgage | \n",
" 908 | \n",
"
\n",
" \n",
" Student loan | \n",
" 1 | \n",
"
\n",
"
"
],
"text/plain": [
"[(u'Bank account or service', 3),\n",
" (u'Credit card', 3),\n",
" (u'Credit reporting', 4),\n",
" (u'Debt collection', 2),\n",
" (u'Mortgage', 908),\n",
" (u'Student loan', 1)]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql select product, count(*)\n",
"from complaints c1 where ZIP_CODE = '48382'\n",
"group by product;"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Let's get down to business... Indexing complainers!\n",
"\n",
"For pedagogical purposes, we'll use _likelihood_ function:\n",
" * likelihood(X,Y) does not change X's value \n",
" * instead it tells the DBMS the fraction of calls on which we believe X holds.\n",
" * 0.0 is lowest, 1.0 is highest.\n",
" * It's a form of _hint_ for the optimizer"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" selectid | \n",
" order | \n",
" from | \n",
" detail | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" SCAN TABLE complaints AS c1 | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" USE TEMP B-TREE FOR GROUP BY | \n",
"
\n",
"
"
],
"text/plain": [
"[(0, 0, 0, u'SCAN TABLE complaints AS c1'),\n",
" (0, 0, 0, u'USE TEMP B-TREE FOR GROUP BY')]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"explain query plan \n",
"select product, count(*) from complaints c1 \n",
"where likelihood(ZIP_CODE = '48382', 1.0) group by product;"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" selectid | \n",
" order | \n",
" from | \n",
" detail | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" SEARCH TABLE complaints AS c1 USING INDEX zip_index (ZIP_code=?) | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" USE TEMP B-TREE FOR GROUP BY | \n",
"
\n",
"
"
],
"text/plain": [
"[(0, 0, 0, u'SEARCH TABLE complaints AS c1 USING INDEX zip_index (ZIP_code=?)'),\n",
" (0, 0, 0, u'USE TEMP B-TREE FOR GROUP BY')]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql \n",
"explain query plan \n",
"select product, count(*) from complaints c1 \n",
"where likelihood(ZIP_CODE = '48382', 0.0) group by product;"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"So... does it make a difference in execution time? \n",
"\n",
"... Well, some it's a small database in memory..."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n",
"time for scan=0.00245881080627 time for index=0.00234699249268\n",
"\t So about 1.04764323446 times faster\n"
]
}
],
"source": [
"start_scan = time.time()\n",
"%sql select product, count(*) from complaints c1 where likelihood(ZIP_CODE = '48382', 1.0) group by product;\n",
"scan_time = time.time() - start_scan\n",
"\n",
"start_index = time.time()\n",
"%sql select product, count(*) from complaints c1 where likelihood(ZIP_CODE = '48382', 0.0) group by product;\n",
"index_time = time.time() - start_index\n",
"print(\"time for scan={0} time for index={1}\".format(scan_time, index_time))\n",
"print(\"\\t So about {0} times faster\".format(scan_time/index_time))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Covering Indexes"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"We say that an index is _covering_ if all the needed columns are in the index.\n",
" * Needed columns is the union of those in WHERE _and_ SELECT.\n",
" * Means that the query can be answere using the index--without reading the table!"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n",
"Done.\n"
]
},
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql drop index if exists helpful_index;\n",
"create index helpful_index on \n",
"complaints(zip_code, product, complaint_id);"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Which plan happens for this query?\n",
"> select product, count(*)
\n",
"> from complaints
\n",
"> where likelihood(ZIP_CODE = '48382', 1.0)\n",
">
group by product;\n",
"\n",
"NB: Using zip_index, this would be a scan!"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" selectid | \n",
" order | \n",
" from | \n",
" detail | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" SEARCH TABLE complaints AS c1 USING COVERING INDEX helpful_index (ZIP_code=?) | \n",
"
\n",
"
"
],
"text/plain": [
"[(0, 0, 0, u'SEARCH TABLE complaints AS c1 USING COVERING INDEX helpful_index (ZIP_code=?)')]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"explain query plan select product, count(*) \n",
"from complaints c1 \n",
"where likelihood(ZIP_CODE = '48382', 1.0) group by product;"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"* The index is _smaller_ in absolute data size.\n",
"* The DBMS knows this, and it picks the right index!"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"Summary\n",
"-------\n",
"\n",
"* This notebook walks through index creation \n",
"* That index selection is done in a cost based way.\n",
"* Introduced the notion of covering index.\n",
"* Demonstrated that not only keys to answer query, but attributes returned important component of selection. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}