diff --git a/README.md b/README.md index 2fc95ab..d0b4ec0 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ # awesome-pandas A collection of resources for [pandas](http://pandas.pydata.org/) ([Python](https://www.python.org/)) and related subjects. -**Pull requests are very welcome.** +**Pull requests are very welcome!** **Contents:** This is an unofficial collection of resources for learning pandas, an open source Python library for data analysis. Here you will find videos, diff --git a/cheat_sheet/Cheat Sheet.ipynb b/cheat_sheet/Cheat Sheet.ipynb index 72ecffc..83bc104 100644 --- a/cheat_sheet/Cheat Sheet.ipynb +++ b/cheat_sheet/Cheat Sheet.ipynb @@ -2,7 +2,9 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "raw_mimetype": "-" + }, "source": [ "# Pandas Cheat Sheet" ] @@ -81,10 +83,16 @@ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import matplotlib\n", - "\n", "%matplotlib inline" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make this Jupyter Notebook reproducible, here are the versions of the libraries we will be using." 
+ ] + }, { "cell_type": "code", "execution_count": 2, @@ -95,8 +103,8 @@ "output_type": "stream", "text": [ "pandas version 0.22.0\n", - "numpy version 1.13.3\n", - "matplotlib version 2.1.1\n" + "numpy version 1.14.2\n", + "matplotlib version 2.2.2\n" ] } ], @@ -105,58 +113,26 @@ " print(f'{lib.__name__.ljust(12)} version {lib.__version__}')" ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "pd.set_option(\"display.max_rows\", 2**6)\n", - "pd.set_option(\"display.max_columns\", 2**6)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "64" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "2**6" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Importing data\n", "\n", - "We'll use" + "Using `!` lets us use terminal commands. The `head` command shows the first rows of the file." 
] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes\r", - "\r\n", - "Color,James Cameron,723,178,0,855,Joel David Moore,1000,760505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar ,886204,4834,Wes Studi,0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1,3054,English,USA,PG-13,237000000,2009,936,7.9,1.78,33000\r", - "\r\n" + "color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes\n", + "Color,James Cameron,723,178,0,855,Joel David Moore,1000,760505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar ,886204,4834,Wes Studi,0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1,3054,English,USA,PG-13,237000000,2009,936,7.9,1.78,33000\n" ] } ], @@ -164,25 +140,43 @@ "!head data/movie_metadata.csv -n 2" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's a huge file, so we'll only load a couple of columns into a pandas DataFrame.\n", + "To familiarize ourselves with with [magic commands](http://ipython.readthedocs.io/en/stable/interactive/magics.html), we'll use `%%time` to time the 
execution of the cell below." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "(5043, 28)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded data of size (5043, 6) into memory.\n", + "CPU times: user 44 ms, sys: 525 µs, total: 44.5 ms\n", + "Wall time: 42.9 ms\n" + ] } ], "source": [ - "df = pd.read_csv(r'data/movie_metadata.csv', sep = ',')\n", - "df.shape" + "%%time\n", + "\n", + "cols_to_use = ['movie_title', 'director_name', 'country', 'content_rating', 'imdb_score', 'gross']\n", + "df = pd.read_csv(r'data/movie_metadata.csv', sep=',', usecols=cols_to_use)\n", + "print(f'Loaded data of size {df.shape} into memory.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `df.shape` attribute gives the number of rows and columns of the DataFrame. \n", + "This leads us naturally to consider summarizations." ] }, { @@ -192,6 +186,132 @@ "# Summarizing data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Three methods are useful for peeking at the data: df.head, df.tail and df.sample.\n", + "Head and tail are $\\mathcal{O}(1)$ operations, while sample is $\\mathcal{O}(n)$, where $n$ is the number of rows.\n", + "For small datasets, this makes no difference in practice. We'll use df.sample here." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
director_namegrossmovie_titlecountrycontent_ratingimdb_score
3097Darren Lynn Bousman63270259.0Saw IVUSAR5.9
1999Roman PolanskiNaNCarnageFranceR7.2
\n", + "
" + ], + "text/plain": [ + " director_name gross movie_title country content_rating \\\n", + "3097 Darren Lynn Bousman 63270259.0 Saw IV  USA R \n", + "1999 Roman Polanski NaN Carnage  France R \n", + "\n", + " imdb_score \n", + "3097 5.9 \n", + "1999 7.2 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sample(n=2, replace=False, weights=None, random_state=None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We should make sure the data types are correct. To do so, we can use df.dtypes, or df.info() for some more information." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 5043 entries, 0 to 5042\n", + "Data columns (total 6 columns):\n", + "director_name 4939 non-null object\n", + "gross 4159 non-null float64\n", + "movie_title 5043 non-null object\n", + "country 5038 non-null object\n", + "content_rating 4740 non-null object\n", + "imdb_score 5043 non-null float64\n", + "dtypes: float64(2), object(4)\n", + "memory usage: 236.5+ KB\n" + ] + } + ], + "source": [ + "df.info(verbose=True, memory_usage=True, null_counts=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have some null values. Let's count them by chaining df.isnull() and df.sum()." + ] + }, { "cell_type": "code", "execution_count": 7, @@ -200,7 +320,13 @@ { "data": { "text/plain": [ - "(5043, 28)" + "director_name 104\n", + "gross 884\n", + "movie_title 0\n", + "country 5\n", + "content_rating 303\n", + "imdb_score 0\n", + "dtype: int64" ] }, "execution_count": 7, @@ -209,7 +335,15 @@ } ], "source": [ - "df.shape" + "null_values = df.isnull().sum()\n", + "null_values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result of the above is not a DataFrame, but a Series." 
] }, { @@ -220,17 +354,7 @@ { "data": { "text/plain": [ - "color object\n", - "director_name object\n", - "num_critic_for_reviews float64\n", - "duration float64\n", - "director_facebook_likes float64\n", - "actor_3_facebook_likes float64\n", - "actor_2_name object\n", - "actor_1_facebook_likes float64\n", - "gross float64\n", - "genres object\n", - "dtype: object" + "pandas.core.series.Series" ] }, "execution_count": 8, @@ -239,7 +363,14 @@ } ], "source": [ - "df.dtypes.head(10)" + "type(null_values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can make the output prettier by converting null_values to a DataFrame using to_frame(), then transposing using .T, and finally renaming the first index." ] }, { @@ -268,149 +399,22 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", + " director_name\n", + " gross\n", + " movie_title\n", + " country\n", + " content_rating\n", + " imdb_score\n", " \n", " \n", " \n", " \n", - " color\n", - " Color\n", - " Color\n", - " \n", - " \n", - " director_name\n", - " James Cameron\n", - " Gore Verbinski\n", - " \n", - " \n", - " num_critic_for_reviews\n", - " 723\n", - " 302\n", - " \n", - " \n", - " duration\n", - " 178\n", - " 169\n", - " \n", - " \n", - " director_facebook_likes\n", + " Missing values\n", + " 104\n", + " 884\n", " 0\n", - " 563\n", - " \n", - " \n", - " actor_3_facebook_likes\n", - " 855\n", - " 1000\n", - " \n", - " \n", - " actor_2_name\n", - " Joel David Moore\n", - " Orlando Bloom\n", - " \n", - " \n", - " actor_1_facebook_likes\n", - " 1000\n", - " 40000\n", - " \n", - " \n", - " gross\n", - " 7.60506e+08\n", - " 3.09404e+08\n", - " \n", - " \n", - " genres\n", - " Action|Adventure|Fantasy|Sci-Fi\n", - " Action|Adventure|Fantasy\n", - " \n", - " \n", - " actor_1_name\n", - " CCH Pounder\n", - " Johnny Depp\n", - " \n", - " \n", - " movie_title\n", - " Avatar\n", - " Pirates of the Caribbean: At World's End\n", - " \n", - " \n", - " num_voted_users\n", - " 886204\n", - " 471220\n", - " 
\n", - " \n", - " cast_total_facebook_likes\n", - " 4834\n", - " 48350\n", - " \n", - " \n", - " actor_3_name\n", - " Wes Studi\n", - " Jack Davenport\n", - " \n", - " \n", - " facenumber_in_poster\n", - " 0\n", - " 0\n", - " \n", - " \n", - " plot_keywords\n", - " avatar|future|marine|native|paraplegic\n", - " goddess|marriage ceremony|marriage proposal|pi...\n", - " \n", - " \n", - " movie_imdb_link\n", - " http://www.imdb.com/title/tt0499549/?ref_=fn_t...\n", - " http://www.imdb.com/title/tt0449088/?ref_=fn_t...\n", - " \n", - " \n", - " num_user_for_reviews\n", - " 3054\n", - " 1238\n", - " \n", - " \n", - " language\n", - " English\n", - " English\n", - " \n", - " \n", - " country\n", - " USA\n", - " USA\n", - " \n", - " \n", - " content_rating\n", - " PG-13\n", - " PG-13\n", - " \n", - " \n", - " budget\n", - " 2.37e+08\n", - " 3e+08\n", - " \n", - " \n", - " title_year\n", - " 2009\n", - " 2007\n", - " \n", - " \n", - " actor_2_facebook_likes\n", - " 936\n", - " 5000\n", - " \n", - " \n", - " imdb_score\n", - " 7.9\n", - " 7.1\n", - " \n", - " \n", - " aspect_ratio\n", - " 1.78\n", - " 2.35\n", - " \n", - " \n", - " movie_facebook_likes\n", - " 33000\n", + " 5\n", + " 303\n", " 0\n", " \n", " \n", @@ -418,65 +422,11 @@ "" ], "text/plain": [ - " 0 \\\n", - "color Color \n", - "director_name James Cameron \n", - "num_critic_for_reviews 723 \n", - "duration 178 \n", - "director_facebook_likes 0 \n", - "actor_3_facebook_likes 855 \n", - "actor_2_name Joel David Moore \n", - "actor_1_facebook_likes 1000 \n", - "gross 7.60506e+08 \n", - "genres Action|Adventure|Fantasy|Sci-Fi \n", - "actor_1_name CCH Pounder \n", - "movie_title Avatar  \n", - "num_voted_users 886204 \n", - "cast_total_facebook_likes 4834 \n", - "actor_3_name Wes Studi \n", - "facenumber_in_poster 0 \n", - "plot_keywords avatar|future|marine|native|paraplegic \n", - "movie_imdb_link http://www.imdb.com/title/tt0499549/?ref_=fn_t... 
\n", - "num_user_for_reviews 3054 \n", - "language English \n", - "country USA \n", - "content_rating PG-13 \n", - "budget 2.37e+08 \n", - "title_year 2009 \n", - "actor_2_facebook_likes 936 \n", - "imdb_score 7.9 \n", - "aspect_ratio 1.78 \n", - "movie_facebook_likes 33000 \n", + " director_name gross movie_title country content_rating \\\n", + "Missing values 104 884 0 5 303 \n", "\n", - " 1 \n", - "color Color \n", - "director_name Gore Verbinski \n", - "num_critic_for_reviews 302 \n", - "duration 169 \n", - "director_facebook_likes 563 \n", - "actor_3_facebook_likes 1000 \n", - "actor_2_name Orlando Bloom \n", - "actor_1_facebook_likes 40000 \n", - "gross 3.09404e+08 \n", - "genres Action|Adventure|Fantasy \n", - "actor_1_name Johnny Depp \n", - "movie_title Pirates of the Caribbean: At World's End  \n", - "num_voted_users 471220 \n", - "cast_total_facebook_likes 48350 \n", - "actor_3_name Jack Davenport \n", - "facenumber_in_poster 0 \n", - "plot_keywords goddess|marriage ceremony|marriage proposal|pi... \n", - "movie_imdb_link http://www.imdb.com/title/tt0449088/?ref_=fn_t... \n", - "num_user_for_reviews 1238 \n", - "language English \n", - "country USA \n", - "content_rating PG-13 \n", - "budget 3e+08 \n", - "title_year 2007 \n", - "actor_2_facebook_likes 5000 \n", - "imdb_score 7.1 \n", - "aspect_ratio 2.35 \n", - "movie_facebook_likes 0 " + " imdb_score \n", + "Missing values 0 " ] }, "execution_count": 9, @@ -485,7 +435,14 @@ } ], "source": [ - "df.head(2).T" + "null_values.to_frame().T.rename(index={0:'Missing values'})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above is called method chaining, and can be written like so:" ] }, { @@ -495,36 +452,53 @@ "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
director_namegrossmovie_titlecountrycontent_ratingimdb_score
Missing values104884053030
\n", + "
" + ], "text/plain": [ - "cast_total_facebook_likes 0\n", - "imdb_score 0\n", - "movie_imdb_link 0\n", - "num_voted_users 0\n", - "movie_title 0\n", - "genres 0\n", - "movie_facebook_likes 0\n", - "country 5\n", - "actor_1_facebook_likes 7\n", - "actor_1_name 7\n", - "language 12\n", - "actor_2_facebook_likes 13\n", - "actor_2_name 13\n", - "facenumber_in_poster 13\n", - "duration 15\n", - "color 19\n", - "num_user_for_reviews 21\n", - "actor_3_facebook_likes 23\n", - "actor_3_name 23\n", - "num_critic_for_reviews 50\n", - "director_facebook_likes 104\n", - "director_name 104\n", - "title_year 108\n", - "plot_keywords 153\n", - "content_rating 303\n", - "aspect_ratio 329\n", - "budget 492\n", - "gross 884\n", - "dtype: int64" + " director_name gross movie_title country content_rating \\\n", + "Missing values 104 884 0 5 303 \n", + "\n", + " imdb_score \n", + "Missing values 0 " ] }, "execution_count": 10, @@ -533,13 +507,274 @@ } ], "source": [ - "df.isnull().sum().sort_values()" + "(df\n", + " .isnull() # Figure out whether every entry is null (missing), or not\n", + " .sum(axis=0) # Sum over each column, axis=0 is the default\n", + " .to_frame() # The result is a Series, convert to DataFrame\n", + " .T # Transpose (switch rows and columns)\n", + " .rename(index={0:'Missing values'}) # Rename the index and show it\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A tour of summarization would not be completed without df.describe().\n", + "Calling df.count(), df.nunique(), df.mean(), df.std(), df.min(), df.quantile(), df.max() is also possible." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
director_namegrossmovie_titlecountrycontent_ratingimdb_score
count493941595043503847405043
unique239849176518
topSteven SpielbergKing KongUSAR
freq26338072118
mean4.84684e+076.44214
std6.8453e+071.12512
min1621.6
50%2.55175e+076.6
max7.60506e+089.5
\n", + "
" + ], + "text/plain": [ + " director_name gross movie_title country content_rating \\\n", + "count 4939 4159 5043 5038 4740 \n", + "unique 2398 4917 65 18 \n", + "top Steven Spielberg King Kong  USA R \n", + "freq 26 3 3807 2118 \n", + "mean 4.84684e+07 \n", + "std 6.8453e+07 \n", + "min 162 \n", + "50% 2.55175e+07 \n", + "max 7.60506e+08 \n", + "\n", + " imdb_score \n", + "count 5043 \n", + "unique \n", + "top \n", + "freq \n", + "mean 6.44214 \n", + "std 1.12512 \n", + "min 1.6 \n", + "50% 6.6 \n", + "max 9.5 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe(percentiles=[0.5], include='all').fillna('')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizations" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
grossimdb_score
gross1.0000000.096247
imdb_score0.0962471.000000
\n", + "
" + ], + "text/plain": [ + " gross imdb_score\n", + "gross 1.000000 0.096247\n", + "imdb_score 0.096247 1.000000" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.corr(method='spearman', min_periods=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot = pd.plotting.scatter_matrix(df, alpha=0.5, figsize=(8,5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Honorable mentions are len(df)." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, "outputs": [], "source": [ "# len(df), shape, value_counts, head, tail, max(), min(), mean, dtype, info(), \n", @@ -555,7 +790,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -565,7 +800,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -637,7 +872,7 @@ "2 PG-13 6.8 " ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -649,7 +884,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -721,7 +956,7 @@ "102 PG-13 7.8 " ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -732,9 +967,29 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tommy/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: \n", + "Passing list-likes to .loc or [] with any missing label will raise\n", + "KeyError in the future, you can use .reindex() as an alternative.\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n", + " \"\"\"Entry point for launching an IPython kernel.\n", + "/home/tommy/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:1367: FutureWarning: \n", + "Passing list-likes to .loc or [] with any missing label will raise\n", + "KeyError in the future, you can use .reindex() as an 
alternative.\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n", + " return self._getitem_tuple(key)\n" + ] + }, { "data": { "text/html": [ @@ -773,7 +1028,7 @@ " USA\n", " PG-13\n", " 7.9\n", - " 0.0\n", + " NaN\n", " 760505847.0\n", " \n", " \n", @@ -783,7 +1038,7 @@ " USA\n", " PG-13\n", " 7.1\n", - " 563.0\n", + " NaN\n", " 309404152.0\n", " \n", " \n", @@ -796,11 +1051,11 @@ "1 Pirates of the Caribbean: At World's End  Gore Verbinski USA \n", "\n", " content_rating imdb_score director_facebook_likes gross \n", - "0 PG-13 7.9 0.0 760505847.0 \n", - "1 PG-13 7.1 563.0 309404152.0 " + "0 PG-13 7.9 NaN 760505847.0 \n", + "1 PG-13 7.1 NaN 309404152.0 " ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -812,7 +1067,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -877,7 +1132,7 @@ "1 PG-13 7.1 309404152.0 " ] }, - "execution_count": 16, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -889,7 +1144,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -965,7 +1220,7 @@ "3466 R 9.2 134821952.0 " ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -976,7 +1231,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -985,7 +1240,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1065,7 +1320,7 @@ "max 10 760,505,847 20" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1076,7 +1331,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1135,7 +1390,7 @@ "gross_log 
0.074280 0.616034 1.000000" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1153,7 +1408,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1233,7 +1488,7 @@ "9 PG 7.5 301956980.0 19.525795 " ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1244,7 +1499,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1324,7 +1579,7 @@ "4029 Brazil R 8.7 7563397.0 15.838831 " ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1336,7 +1591,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1346,7 +1601,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1450,7 +1705,7 @@ "1196 USA R 7.8 102310175.0 18.443520 " ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1461,7 +1716,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1475,7 +1730,7 @@ "Name: country, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1486,7 +1741,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1500,7 +1755,7 @@ "Name: country, dtype: int64" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1517,10 +1772,8 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ "- **Split-apply-combine and pivots**: groupby, dt.month, dt.year, groupby.mean(), agg, stack, unstack, pivot, melt, 
merge" ] @@ -1534,7 +1787,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1600,7 +1853,7 @@ "Ridley Scott 16" ] }, - "execution_count": 36, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1611,7 +1864,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1677,7 +1930,7 @@ "Charles Chaplin 8.6" ] }, - "execution_count": 41, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1688,7 +1941,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1792,7 +2045,7 @@ "4 NaN NaN 7.1 NaN NaN " ] }, - "execution_count": 43, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1803,7 +2056,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1869,7 +2122,7 @@ "Aaron Schneider 7.1 16.032162 1" ] }, - "execution_count": 49, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1882,7 +2135,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1948,7 +2201,7 @@ "Aaron Schneider 0.771125 -0.321155 -0.488133" ] }, - "execution_count": 69, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1960,7 +2213,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2031,7 +2284,7 @@ "Clint Eastwood 0.883067 8.041961 8.705881 17.630909" ] }, - "execution_count": 71, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2047,9 +2300,22 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 36, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/tommy/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py:858: FutureWarning: \n", + "Passing list-likes to .loc or [] with any missing label will raise\n", + "KeyError in the future, you can use .reindex() as an alternative.\n", + "\n", + "See the documentation here:\n", + "http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike\n", + " return self._getitem_lowerdim(tup)\n" + ] + }, { "data": { "text/html": [ @@ -2071,155 +2337,23 @@ " \n", " \n", " content_rating\n", - " Approved\n", - " G\n", - " GP\n", - " M\n", - " NC-17\n", - " Not Rated\n", - " PG\n", - " PG-13\n", - " Passed\n", - " R\n", - " TV-14\n", - " TV-G\n", - " TV-PG\n", - " Unrated\n", - " X\n", " \n", " \n", " title_year\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " 2012.0\n", - " 0.0\n", - " 2.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 18.0\n", - " 26.0\n", - " 71.0\n", - " 0.0\n", - " 84.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", - " \n", - " 2013.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 14.0\n", - " 26.0\n", - " 68.0\n", - " 0.0\n", - " 93.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - " 2014.0\n", - " 0.0\n", - " 3.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 9.0\n", - " 28.0\n", - " 72.0\n", - " 0.0\n", - " 86.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 5.0\n", - " 0.0\n", - " \n", - " \n", - " 2015.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 5.0\n", - " 22.0\n", - " 59.0\n", - " 0.0\n", - " 77.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " \n", - " \n", - " 2016.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 2.0\n", - " 11.0\n", - " 41.0\n", - " 0.0\n", - " 31.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " \n", " \n", 
"\n", "" ], "text/plain": [ - "content_rating Approved G GP M NC-17 Not Rated PG PG-13 \\\n", - "title_year \n", - "2012.0 0.0 2.0 0.0 0.0 0.0 18.0 26.0 71.0 \n", - "2013.0 0.0 1.0 0.0 0.0 0.0 14.0 26.0 68.0 \n", - "2014.0 0.0 3.0 0.0 0.0 0.0 9.0 28.0 72.0 \n", - "2015.0 0.0 1.0 0.0 0.0 0.0 5.0 22.0 59.0 \n", - "2016.0 0.0 0.0 0.0 0.0 0.0 2.0 11.0 41.0 \n", - "\n", - "content_rating Passed R TV-14 TV-G TV-PG Unrated X \n", - "title_year \n", - "2012.0 0.0 84.0 0.0 1.0 0.0 0.0 0.0 \n", - "2013.0 0.0 93.0 1.0 0.0 0.0 1.0 0.0 \n", - "2014.0 0.0 86.0 0.0 0.0 0.0 5.0 0.0 \n", - "2015.0 0.0 77.0 1.0 0.0 0.0 1.0 0.0 \n", - "2016.0 0.0 31.0 0.0 0.0 0.0 0.0 0.0 " + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" ] }, - "execution_count": 82, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -2238,7 +2372,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -2262,182 +2396,59 @@ " \n", " \n", " \n", - " color\n", " director_name\n", - " num_critic_for_reviews\n", - " duration\n", - " director_facebook_likes\n", - " actor_3_facebook_likes\n", - " actor_2_name\n", - " actor_1_facebook_likes\n", " gross\n", - " genres\n", - " actor_1_name\n", " movie_title\n", - " num_voted_users\n", - " cast_total_facebook_likes\n", - " actor_3_name\n", - " facenumber_in_poster\n", - " plot_keywords\n", - " movie_imdb_link\n", - " num_user_for_reviews\n", - " language\n", " country\n", " content_rating\n", - " budget\n", - " title_year\n", - " actor_2_facebook_likes\n", " imdb_score\n", - " aspect_ratio\n", - " movie_facebook_likes\n", " \n", " \n", " \n", " \n", " 0\n", - " Color\n", " James Cameron\n", - " 723.0\n", - " 178.0\n", - " 0.0\n", - " 855.0\n", - " Joel David Moore\n", - " 1000.0\n", " 760505847.0\n", - " Action|Adventure|Fantasy|Sci-Fi\n", - " CCH Pounder\n", " Avatar\n", - " 886204\n", - " 4834\n", - " Wes Studi\n", - " 0.0\n", - " avatar|future|marine|native|paraplegic\n", - " 
http://www.imdb.com/title/tt0499549/?ref_=fn_t...\n", - " 3054.0\n", - " English\n", " USA\n", " PG-13\n", - " 237000000.0\n", - " 2009.0\n", - " 936.0\n", " 7.9\n", - " 1.78\n", - " 33000\n", " \n", " \n", " 1\n", - " Color\n", " Gore Verbinski\n", - " 302.0\n", - " 169.0\n", - " 563.0\n", - " 1000.0\n", - " Orlando Bloom\n", - " 40000.0\n", " 309404152.0\n", - " Action|Adventure|Fantasy\n", - " Johnny Depp\n", " Pirates of the Caribbean: At World's End\n", - " 471220\n", - " 48350\n", - " Jack Davenport\n", - " 0.0\n", - " goddess|marriage ceremony|marriage proposal|pi...\n", - " http://www.imdb.com/title/tt0449088/?ref_=fn_t...\n", - " 1238.0\n", - " English\n", " USA\n", " PG-13\n", - " 300000000.0\n", - " 2007.0\n", - " 5000.0\n", " 7.1\n", - " 2.35\n", - " 0\n", " \n", " \n", " 2\n", - " Color\n", " Sam Mendes\n", - " 602.0\n", - " 148.0\n", - " 0.0\n", - " 161.0\n", - " Rory Kinnear\n", - " 11000.0\n", " 200074175.0\n", - " Action|Adventure|Thriller\n", - " Christoph Waltz\n", " Spectre\n", - " 275868\n", - " 11700\n", - " Stephanie Sigman\n", - " 1.0\n", - " bomb|espionage|sequel|spy|terrorist\n", - " http://www.imdb.com/title/tt2379713/?ref_=fn_t...\n", - " 994.0\n", - " English\n", " UK\n", " PG-13\n", - " 245000000.0\n", - " 2015.0\n", - " 393.0\n", " 6.8\n", - " 2.35\n", - " 85000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " color director_name num_critic_for_reviews duration \\\n", - "0 Color James Cameron 723.0 178.0 \n", - "1 Color Gore Verbinski 302.0 169.0 \n", - "2 Color Sam Mendes 602.0 148.0 \n", + " director_name gross movie_title \\\n", + "0 James Cameron 760505847.0 Avatar  \n", + "1 Gore Verbinski 309404152.0 Pirates of the Caribbean: At World's End  \n", + "2 Sam Mendes 200074175.0 Spectre  \n", "\n", - " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", - "0 0.0 855.0 Joel David Moore \n", - "1 563.0 1000.0 Orlando Bloom \n", - "2 0.0 161.0 Rory Kinnear \n", - "\n", - " actor_1_facebook_likes gross genres \\\n", - "0 
1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n", - "1 40000.0 309404152.0 Action|Adventure|Fantasy \n", - "2 11000.0 200074175.0 Action|Adventure|Thriller \n", - "\n", - " actor_1_name movie_title \\\n", - "0 CCH Pounder Avatar  \n", - "1 Johnny Depp Pirates of the Caribbean: At World's End  \n", - "2 Christoph Waltz Spectre  \n", - "\n", - " num_voted_users cast_total_facebook_likes actor_3_name \\\n", - "0 886204 4834 Wes Studi \n", - "1 471220 48350 Jack Davenport \n", - "2 275868 11700 Stephanie Sigman \n", - "\n", - " facenumber_in_poster plot_keywords \\\n", - "0 0.0 avatar|future|marine|native|paraplegic \n", - "1 0.0 goddess|marriage ceremony|marriage proposal|pi... \n", - "2 1.0 bomb|espionage|sequel|spy|terrorist \n", - "\n", - " movie_imdb_link num_user_for_reviews \\\n", - "0 http://www.imdb.com/title/tt0499549/?ref_=fn_t... 3054.0 \n", - "1 http://www.imdb.com/title/tt0449088/?ref_=fn_t... 1238.0 \n", - "2 http://www.imdb.com/title/tt2379713/?ref_=fn_t... 994.0 \n", - "\n", - " language country content_rating budget title_year \\\n", - "0 English USA PG-13 237000000.0 2009.0 \n", - "1 English USA PG-13 300000000.0 2007.0 \n", - "2 English UK PG-13 245000000.0 2015.0 \n", - "\n", - " actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \n", - "0 936.0 7.9 1.78 33000 \n", - "1 5000.0 7.1 2.35 0 \n", - "2 393.0 6.8 2.35 85000 " + " country content_rating imdb_score \n", + "0 USA PG-13 7.9 \n", + "1 USA PG-13 7.1 \n", + "2 UK PG-13 6.8 " ] }, - "execution_count": 83, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -2448,181 +2459,25 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 38, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
content_ratingApprovedGGPMNC-17Not RatedPGPG-13PassedRTV-14TV-GTV-PGUnratedX
title_year
2012.00.02.00.00.00.018.026.071.00.084.00.01.00.00.00.0
2013.00.01.00.00.00.014.026.068.00.093.01.00.00.01.00.0
2014.00.03.00.00.00.09.028.072.00.086.00.00.00.05.00.0
2015.00.01.00.00.00.05.022.059.00.077.01.00.00.01.00.0
2016.00.00.00.00.00.02.011.041.00.031.00.00.00.00.00.0
\n", - "
" - ], - "text/plain": [ - "content_rating Approved G GP M NC-17 Not Rated PG PG-13 \\\n", - "title_year \n", - "2012.0 0.0 2.0 0.0 0.0 0.0 18.0 26.0 71.0 \n", - "2013.0 0.0 1.0 0.0 0.0 0.0 14.0 26.0 68.0 \n", - "2014.0 0.0 3.0 0.0 0.0 0.0 9.0 28.0 72.0 \n", - "2015.0 0.0 1.0 0.0 0.0 0.0 5.0 22.0 59.0 \n", - "2016.0 0.0 0.0 0.0 0.0 0.0 2.0 11.0 41.0 \n", - "\n", - "content_rating Passed R TV-14 TV-G TV-PG Unrated X \n", - "title_year \n", - "2012.0 0.0 84.0 0.0 1.0 0.0 0.0 0.0 \n", - "2013.0 0.0 93.0 1.0 0.0 0.0 1.0 0.0 \n", - "2014.0 0.0 86.0 0.0 0.0 0.0 5.0 0.0 \n", - "2015.0 0.0 77.0 1.0 0.0 0.0 1.0 0.0 \n", - "2016.0 0.0 31.0 0.0 0.0 0.0 0.0 0.0 " - ] - }, - "execution_count": 90, - "metadata": {}, - "output_type": "execute_result" + "ename": "KeyError", + "evalue": "'title_year'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'title_year'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'content_rating'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m aggfunc=pd.DataFrame.nunique)\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m .tail(5))\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mpivot_table\u001b[0;34m(self, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)\u001b[0m\n\u001b[1;32m 4466\u001b[0m 
\u001b[0maggfunc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maggfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfill_value\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfill_value\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4467\u001b[0m \u001b[0mmargins\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmargins\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdropna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdropna\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4468\u001b[0;31m margins_name=margins_name)\n\u001b[0m\u001b[1;32m 4469\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4470\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdropna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/pivot.py\u001b[0m in \u001b[0;36mpivot_table\u001b[0;34m(data, values, index, columns, aggfunc, fill_value, margins, dropna, margins_name)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m \u001b[0mgrouped\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 82\u001b[0m \u001b[0magged\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgrouped\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0magg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maggfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mgroupby\u001b[0;34m(self, by, axis, level, as_index, sort, group_keys, squeeze, **kwargs)\u001b[0m\n\u001b[1;32m 5160\u001b[0m return groupby(self, by=by, axis=axis, level=level, as_index=as_index,\n\u001b[1;32m 5161\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msort\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroup_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mgroup_keys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msqueeze\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msqueeze\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5162\u001b[0;31m **kwargs)\n\u001b[0m\u001b[1;32m 5163\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5164\u001b[0m def asfreq(self, freq, method=None, how=None, normalize=False,\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py\u001b[0m in \u001b[0;36mgroupby\u001b[0;34m(obj, by, **kwds)\u001b[0m\n\u001b[1;32m 1846\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'invalid type: %s'\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1847\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1848\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mklass\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1849\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1850\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, obj, keys, axis, level, grouper, 
exclusions, selection, as_index, sort, group_keys, squeeze, **kwargs)\u001b[0m\n\u001b[1;32m 514\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 515\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msort\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 516\u001b[0;31m mutated=self.mutated)\n\u001b[0m\u001b[1;32m 517\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 518\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py\u001b[0m in \u001b[0;36m_get_grouper\u001b[0;34m(obj, key, axis, level, sort, mutated, validate)\u001b[0m\n\u001b[1;32m 2932\u001b[0m \u001b[0min_axis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgpr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2933\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2934\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2935\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgpr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGrouper\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mgpr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkey\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2936\u001b[0m 
\u001b[0;31m# Add key to exclusions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'title_year'" + ] } ], "source": [ @@ -2680,455 +2535,9 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/tommy/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py:4291: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version\n", - " return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
msdf
title_year
1916.0123.000000NaN
1920.0110.000000NaN
1925.0151.000000NaN
1927.0145.000000NaN
1929.0105.0000007.071068
1930.096.000000NaN
1932.079.000000NaN
1933.077.50000016.263456
1934.065.000000NaN
1935.081.000000NaN
1936.093.5000009.192388
1937.092.00000012.727922
1938.0116.00000014.142136
1939.0149.33333367.002488
1940.0108.00000021.059440
1941.0118.000000NaN
1942.076.0000008.485281
1943.0122.000000NaN
1944.0101.000000NaN
1945.0103.75000010.500000
1946.0144.66666727.006172
1947.0101.66666714.224392
1948.098.66666729.143324
1949.0106.0000004.242641
1950.0107.000000NaN
1951.0134.66666732.593455
1952.0106.75000031.223656
1953.0106.25000024.878036
1954.0140.60000036.073536
1955.0112.50000019.091883
1956.0103.00000013.114877
1957.0128.50000045.961941
.........
1985.0108.72413820.325734
1986.0104.65384614.707664
1987.0105.50000029.359837
1988.0107.00000018.638669
1989.0113.12121219.582769
1990.0114.10000032.563307
1991.0113.06451626.856328
1992.0116.08823524.342714
1993.0120.22916743.293280
1994.0111.48148125.176165
1995.0114.52857124.225224
1996.0110.01010118.521083
1997.0110.25423721.591858
1998.0109.46268720.978204
1999.0108.64285720.332059
2000.0107.53529420.673610
2001.0106.94148923.319033
2002.0104.22966518.260268
2003.0106.40236722.667451
2004.0107.29906520.988579
2005.0107.50454522.870790
2006.0107.14705920.243023
2007.0106.89705924.419854
2008.0105.38222217.785318
2009.0105.12741318.262597
2010.0105.43421115.323819
2011.0105.03571416.797356
2012.0105.90000022.001640
2013.0108.09322022.074145
2014.0105.42629519.384879
2015.0106.02232117.848250
2016.0109.63207517.020618
\n", - "

91 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " m sdf\n", - "title_year \n", - "1916.0 123.000000 NaN\n", - "1920.0 110.000000 NaN\n", - "1925.0 151.000000 NaN\n", - "1927.0 145.000000 NaN\n", - "1929.0 105.000000 7.071068\n", - "1930.0 96.000000 NaN\n", - "1932.0 79.000000 NaN\n", - "1933.0 77.500000 16.263456\n", - "1934.0 65.000000 NaN\n", - "1935.0 81.000000 NaN\n", - "1936.0 93.500000 9.192388\n", - "1937.0 92.000000 12.727922\n", - "1938.0 116.000000 14.142136\n", - "1939.0 149.333333 67.002488\n", - "1940.0 108.000000 21.059440\n", - "1941.0 118.000000 NaN\n", - "1942.0 76.000000 8.485281\n", - "1943.0 122.000000 NaN\n", - "1944.0 101.000000 NaN\n", - "1945.0 103.750000 10.500000\n", - "1946.0 144.666667 27.006172\n", - "1947.0 101.666667 14.224392\n", - "1948.0 98.666667 29.143324\n", - "1949.0 106.000000 4.242641\n", - "1950.0 107.000000 NaN\n", - "1951.0 134.666667 32.593455\n", - "1952.0 106.750000 31.223656\n", - "1953.0 106.250000 24.878036\n", - "1954.0 140.600000 36.073536\n", - "1955.0 112.500000 19.091883\n", - "1956.0 103.000000 13.114877\n", - "1957.0 128.500000 45.961941\n", - "... ... 
...\n", - "1985.0 108.724138 20.325734\n", - "1986.0 104.653846 14.707664\n", - "1987.0 105.500000 29.359837\n", - "1988.0 107.000000 18.638669\n", - "1989.0 113.121212 19.582769\n", - "1990.0 114.100000 32.563307\n", - "1991.0 113.064516 26.856328\n", - "1992.0 116.088235 24.342714\n", - "1993.0 120.229167 43.293280\n", - "1994.0 111.481481 25.176165\n", - "1995.0 114.528571 24.225224\n", - "1996.0 110.010101 18.521083\n", - "1997.0 110.254237 21.591858\n", - "1998.0 109.462687 20.978204\n", - "1999.0 108.642857 20.332059\n", - "2000.0 107.535294 20.673610\n", - "2001.0 106.941489 23.319033\n", - "2002.0 104.229665 18.260268\n", - "2003.0 106.402367 22.667451\n", - "2004.0 107.299065 20.988579\n", - "2005.0 107.504545 22.870790\n", - "2006.0 107.147059 20.243023\n", - "2007.0 106.897059 24.419854\n", - "2008.0 105.382222 17.785318\n", - "2009.0 105.127413 18.262597\n", - "2010.0 105.434211 15.323819\n", - "2011.0 105.035714 16.797356\n", - "2012.0 105.900000 22.001640\n", - "2013.0 108.093220 22.074145\n", - "2014.0 105.426295 19.384879\n", - "2015.0 106.022321 17.848250\n", - "2016.0 109.632075 17.020618\n", - "\n", - "[91 rows x 2 columns]" - ] - }, - "execution_count": 124, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "to_plot = df.groupby(df.title_year).agg({df.duration.name:{'m':np.mean, 'sdf':np.std}})\n", "\n", @@ -3136,7 +2545,7 @@ "\n", "#to_plot = to_plot.assign(low = lambda df: df.mean - df.std)\n", "\n", - "to_plot" + "to_plot.plot()" ] }, { @@ -3173,6 +2582,78 @@ "cell_type": "markdown", "metadata": {}, "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# https://www.kaggle.com/zynicide/wine-reviews" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {