From ea763881dcb08791a166570ba6c1631f2f86f701 Mon Sep 17 00:00:00 2001 From: tommyod Date: Mon, 26 Mar 2018 18:52:14 +0200 Subject: [PATCH] Updated readme with 2 video links --- README.md | 2 + cheat_sheet/Cheat Sheet.ipynb | 3100 ++++++++++++++++++++++++++++++++- 2 files changed, 3100 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 77614ce..2fc95ab 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,8 @@ quantified roughly as follows: | [Pandas: .head() to .tail()](https://www.youtube.com/watch?v=7vuO9QXDN50) [[repo](https://github.com/TomAugspurger/pydata-chi-h2t)] | Tom Augspurger | PyData | 1:26 | 3000 | 2016 | :sweat_smile: | | [Performance Pandas](https://www.youtube.com/watch?v=xUBoPK6FGIU) (london) [[repo](https://github.com/jreback/pydata2015-london)] | Jeff Reback | PyData | 0:43 | 2000 | 2015 | :sweat_smile: | | [Performance Pandas](https://www.youtube.com/watch?v=xUBoPK6FGIU) (NYC) [[repo](https://github.com/jreback/pydatanyc2015)] | Jeff Reback | PyData | 1:26 | 3000 | 2015 | :sweat_smile: | +| [Python Data Science with pandas](https://www.youtube.com/watch?v=ikOEn8jY2Is) [[repo](https://github.com/mattharrison/PyCharm-2018-Webcast)] | Matt Harrison | JetBrainsTV | 1:09 | 2000 | 2018 | :smiley: | +| [What is the Future of Pandas](https://www.youtube.com/watch?v=_-gJtO0XR48) [[slides](https://www.slideshare.net/JeffReback/future-of-pandas-82901487)] | Jeff Reback | PyData | 0:31 | 4000 | 2017 | :smiley: | *Know of a recent, good video? 
Send a pull request!* :+1: diff --git a/cheat_sheet/Cheat Sheet.ipynb b/cheat_sheet/Cheat Sheet.ipynb index ab64c47..72ecffc 100644 --- a/cheat_sheet/Cheat Sheet.ipynb +++ b/cheat_sheet/Cheat Sheet.ipynb @@ -36,13 +36,3109 @@ { "cell_type": "markdown", "metadata": {}, - "source": [] + "source": [ + "# The setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Python and Anaconda\n", + "\n", + "If you haven't done it, start by installing Python.\n", + "The [Anaconda Distribution](https://www.anaconda.com/download/) is great, install version `3.X`.\n", + "- If you're on Windows, you will get a program called *Anaconda Prompt*. Open it and run `conda --version` to verify that everything works.\n", + "- If you're on Linux, open a terminal and run `conda --version`.\n", + "\n", + "## Pandas, NumPy and matplotlib\n", + "\n", + "To install packages, run `conda install <package-name>`. The Anaconda distribution comes with the three packages we will require, namely [pandas](https://pandas.pydata.org/), [NumPy](http://www.numpy.org/) and [matplotlib](https://matplotlib.org/).\n", + "\n", + "- **NumPy** implements $n$-dimensional arrays in Python for efficient computations. See the [arXiv](https://arxiv.org/pdf/1102.1523.pdf) paper for a nice introduction. To learn basic NumPy, consider doing these [100 NumPy exercises](https://github.com/rougier/numpy-100).\n", + "- **Matplotlib** is the most popular library for plotting in Python. 
See the beautiful [gallery](https://matplotlib.org/gallery.html) to get an overview of the capabilities of matplotlib.\n", + "- **Pandas** is a library for data analysis based on two objects, the [Series](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html) and the [DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html).\n", + "\n", + "## Jupyter\n", + "\n", + "The [Jupyter Notebook](https://jupyter-notebook.readthedocs.io/en/stable/) is an environment in which you can run Python code, display graphs and work with data interactively. Think of it as a tool between the simple terminal and the full-fledged IDE. Move to a directory using the `cd` command in the terminal, then run `jupyter notebook` to start up a notebook. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Importing packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pandas version 0.22.0\n", + "numpy version 1.13.3\n", + "matplotlib version 2.1.1\n" + ] + } + ], + "source": [ + "for lib in [pd, np, matplotlib]:\n", + " print(f'{lib.__name__.ljust(12)} version {lib.__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option(\"display.max_rows\", 2**6)\n", + "pd.set_option(\"display.max_columns\", 2**6)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "2**6" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "# Importing data\n", + "\n", + "We'll use" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes\r", + "\r\n", + "Color,James Cameron,723,178,0,855,Joel David Moore,1000,760505847,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar ,886204,4834,Wes Studi,0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1,3054,English,USA,PG-13,237000000,2009,936,7.9,1.78,33000\r", + "\r\n" + ] + } + ], + "source": [ + "!head data/movie_metadata.csv -n 2" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5043, 28)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(r'data/movie_metadata.csv', sep = ',')\n", + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summarizing data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5043, 28)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "color object\n", + "director_name object\n", + "num_critic_for_reviews float64\n", + "duration float64\n", + 
"director_facebook_likes float64\n", + "actor_3_facebook_likes float64\n", + "actor_2_name object\n", + "actor_1_facebook_likes float64\n", + "gross float64\n", + "genres object\n", + "dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
colorColorColor
director_nameJames CameronGore Verbinski
num_critic_for_reviews723302
duration178169
director_facebook_likes0563
actor_3_facebook_likes8551000
actor_2_nameJoel David MooreOrlando Bloom
actor_1_facebook_likes100040000
gross7.60506e+083.09404e+08
genresAction|Adventure|Fantasy|Sci-FiAction|Adventure|Fantasy
actor_1_nameCCH PounderJohnny Depp
movie_titleAvatarPirates of the Caribbean: At World's End
num_voted_users886204471220
cast_total_facebook_likes483448350
actor_3_nameWes StudiJack Davenport
facenumber_in_poster00
plot_keywordsavatar|future|marine|native|paraplegicgoddess|marriage ceremony|marriage proposal|pi...
movie_imdb_linkhttp://www.imdb.com/title/tt0499549/?ref_=fn_t...http://www.imdb.com/title/tt0449088/?ref_=fn_t...
num_user_for_reviews30541238
languageEnglishEnglish
countryUSAUSA
content_ratingPG-13PG-13
budget2.37e+083e+08
title_year20092007
actor_2_facebook_likes9365000
imdb_score7.97.1
aspect_ratio1.782.35
movie_facebook_likes330000
\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "color Color \n", + "director_name James Cameron \n", + "num_critic_for_reviews 723 \n", + "duration 178 \n", + "director_facebook_likes 0 \n", + "actor_3_facebook_likes 855 \n", + "actor_2_name Joel David Moore \n", + "actor_1_facebook_likes 1000 \n", + "gross 7.60506e+08 \n", + "genres Action|Adventure|Fantasy|Sci-Fi \n", + "actor_1_name CCH Pounder \n", + "movie_title Avatar  \n", + "num_voted_users 886204 \n", + "cast_total_facebook_likes 4834 \n", + "actor_3_name Wes Studi \n", + "facenumber_in_poster 0 \n", + "plot_keywords avatar|future|marine|native|paraplegic \n", + "movie_imdb_link http://www.imdb.com/title/tt0499549/?ref_=fn_t... \n", + "num_user_for_reviews 3054 \n", + "language English \n", + "country USA \n", + "content_rating PG-13 \n", + "budget 2.37e+08 \n", + "title_year 2009 \n", + "actor_2_facebook_likes 936 \n", + "imdb_score 7.9 \n", + "aspect_ratio 1.78 \n", + "movie_facebook_likes 33000 \n", + "\n", + " 1 \n", + "color Color \n", + "director_name Gore Verbinski \n", + "num_critic_for_reviews 302 \n", + "duration 169 \n", + "director_facebook_likes 563 \n", + "actor_3_facebook_likes 1000 \n", + "actor_2_name Orlando Bloom \n", + "actor_1_facebook_likes 40000 \n", + "gross 3.09404e+08 \n", + "genres Action|Adventure|Fantasy \n", + "actor_1_name Johnny Depp \n", + "movie_title Pirates of the Caribbean: At World's End  \n", + "num_voted_users 471220 \n", + "cast_total_facebook_likes 48350 \n", + "actor_3_name Jack Davenport \n", + "facenumber_in_poster 0 \n", + "plot_keywords goddess|marriage ceremony|marriage proposal|pi... \n", + "movie_imdb_link http://www.imdb.com/title/tt0449088/?ref_=fn_t... 
\n", + "num_user_for_reviews 1238 \n", + "language English \n", + "country USA \n", + "content_rating PG-13 \n", + "budget 3e+08 \n", + "title_year 2007 \n", + "actor_2_facebook_likes 5000 \n", + "imdb_score 7.1 \n", + "aspect_ratio 2.35 \n", + "movie_facebook_likes 0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2).T" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "cast_total_facebook_likes 0\n", + "imdb_score 0\n", + "movie_imdb_link 0\n", + "num_voted_users 0\n", + "movie_title 0\n", + "genres 0\n", + "movie_facebook_likes 0\n", + "country 5\n", + "actor_1_facebook_likes 7\n", + "actor_1_name 7\n", + "language 12\n", + "actor_2_facebook_likes 13\n", + "actor_2_name 13\n", + "facenumber_in_poster 13\n", + "duration 15\n", + "color 19\n", + "num_user_for_reviews 21\n", + "actor_3_facebook_likes 23\n", + "actor_3_name 23\n", + "num_critic_for_reviews 50\n", + "director_facebook_likes 104\n", + "director_name 104\n", + "title_year 108\n", + "plot_keywords 153\n", + "content_rating 303\n", + "aspect_ratio 329\n", + "budget 492\n", + "gross 884\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum().sort_values()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# len(df), shape, value_counts, head, tail, max(), min(), mean, dtype, info(), \n", + "# describe(), memory_usage(), scatter matrix, corr, isnull, notnull, unique(), nlargest" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Selecting and computing" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# - **Selecting and computing**: select subset of row and cols, .loc, .iloc, \n", + "# drop columns, assign, 
apply/map/applymap, multiindex" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_score
0AvatarJames CameronUSAPG-137.9
1Pirates of the Caribbean: At World's EndGore VerbinskiUSAPG-137.1
2SpectreSam MendesUKPG-136.8
\n", + "
" + ], + "text/plain": [ + " movie_title director_name country \\\n", + "0 Avatar  James Cameron USA \n", + "1 Pirates of the Caribbean: At World's End  Gore Verbinski USA \n", + "2 Spectre  Sam Mendes UK \n", + "\n", + " content_rating imdb_score \n", + "0 PG-13 7.9 \n", + "1 PG-13 7.1 \n", + "2 PG-13 6.8 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "columns = ['movie_title', 'director_name', 'country', 'content_rating', 'imdb_score']\n", + "df[columns].head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_score
100The Fast and the FuriousRob CohenUSAPG-136.7
101The Curious Case of Benjamin ButtonDavid FincherUSAPG-137.8
102X-Men: First ClassMatthew VaughnUSAPG-137.8
\n", + "
" + ], + "text/plain": [ + " movie_title director_name country \\\n", + "100 The Fast and the Furious  Rob Cohen USA \n", + "101 The Curious Case of Benjamin Button  David Fincher USA \n", + "102 X-Men: First Class  Matthew Vaughn USA \n", + "\n", + " content_rating imdb_score \n", + "100 PG-13 6.7 \n", + "101 PG-13 7.8 \n", + "102 PG-13 7.8 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[100:102, columns]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_scoredirector_facebook_likesgross
0AvatarJames CameronUSAPG-137.90.0760505847.0
1Pirates of the Caribbean: At World's EndGore VerbinskiUSAPG-137.1563.0309404152.0
\n", + "
" + ], + "text/plain": [ + " movie_title director_name country \\\n", + "0 Avatar  James Cameron USA \n", + "1 Pirates of the Caribbean: At World's End  Gore Verbinski USA \n", + "\n", + " content_rating imdb_score director_facebook_likes gross \n", + "0 PG-13 7.9 0.0 760505847.0 \n", + "1 PG-13 7.1 563.0 309404152.0 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols = df.loc[:, columns + ['director_facebook_likes', 'gross']]\n", + "df_cols.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_scoregross
0AvatarJames CameronUSAPG-137.9760505847.0
1Pirates of the Caribbean: At World's EndGore VerbinskiUSAPG-137.1309404152.0
\n", + "
" + ], + "text/plain": [ + " movie_title director_name country \\\n", + "0 Avatar  James Cameron USA \n", + "1 Pirates of the Caribbean: At World's End  Gore Verbinski USA \n", + "\n", + " content_rating imdb_score gross \n", + "0 PG-13 7.9 760505847.0 \n", + "1 PG-13 7.1 309404152.0 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols = df_cols.drop(columns=['director_facebook_likes'])\n", + "df_cols.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_scoregross
2765Towering InfernoJohn BlanchardCanadaNaN9.5NaN
1937The Shawshank RedemptionFrank DarabontUSAR9.328341469.0
3466The GodfatherFrancis Ford CoppolaUSAR9.2134821952.0
\n", + "
" + ], + "text/plain": [ + " movie_title director_name country \\\n", + "2765 Towering Inferno  John Blanchard Canada \n", + "1937 The Shawshank Redemption  Frank Darabont USA \n", + "3466 The Godfather  Francis Ford Coppola USA \n", + "\n", + " content_rating imdb_score gross \n", + "2765 NaN 9.5 NaN \n", + "1937 R 9.3 28341469.0 \n", + "3466 R 9.2 134821952.0 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols.nlargest(3, columns=['imdb_score'], keep='first')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "df_cols = df_cols.assign(gross_log = lambda df: np.log(df.gross))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imdb_scoregrossgross_log
count5,0434,1594,159
mean648,468,40816
std168,452,9902
min21625
50%725,517,50017
max10760,505,84720
\n", + "
" + ], + "text/plain": [ + " imdb_score gross gross_log\n", + "count 5,043 4,159 4,159\n", + "mean 6 48,468,408 16\n", + "std 1 68,452,990 2\n", + "min 2 162 5\n", + "50% 7 25,517,500 17\n", + "max 10 760,505,847 20" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols.describe(percentiles=[0.5]).applymap(lambda x: '{:,}'.format(round(x)))" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imdb_scoregrossgross_log
imdb_score1.0000000.1980210.074280
gross0.1980211.0000000.616034
gross_log0.0742800.6160341.000000
\n", + "
" + ], + "text/plain": [ + " imdb_score gross gross_log\n", + "imdb_score 1.000000 0.198021 0.074280\n", + "gross 0.198021 1.000000 0.616034\n", + "gross_log 0.074280 0.616034 1.000000" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols.corr()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Filtering and sorting" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_scoregrossgross_log
2SpectreSam MendesUKPG-136.8200074175.019.114199
4Star Wars: Episode VII - The Force Awakens  ...Doug WalkerNaNNaN7.1NaNNaN
9Harry Potter and the Half-Blood PrinceDavid YatesUKPG7.5301956980.019.525795
\n", + "
" + ], + "text/plain": [ + " movie_title director_name country \\\n", + "2 Spectre  Sam Mendes UK \n", + "4 Star Wars: Episode VII - The Force Awakens  ... Doug Walker NaN \n", + "9 Harry Potter and the Half-Blood Prince  David Yates UK \n", + "\n", + " content_rating imdb_score gross gross_log \n", + "2 PG-13 6.8 200074175.0 19.114199 \n", + "4 NaN 7.1 NaN NaN \n", + "9 PG 7.5 301956980.0 19.525795 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols[df_cols.country != 'USA'].head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_scoregrossgross_log
4498The Good, the Bad and the UglySergio LeoneItalyApproved8.96100000.015.623799
270The Lord of the Rings: The Fellowship of the R...Peter JacksonNew ZealandPG-138.8313837577.019.564386
4029City of GodFernando MeirellesBrazilR8.77563397.015.838831
\n", + "
" + ], + "text/plain": [ + " movie_title director_name \\\n", + "4498 The Good, the Bad and the Ugly  Sergio Leone \n", + "270 The Lord of the Rings: The Fellowship of the R... Peter Jackson \n", + "4029 City of God  Fernando Meirelles \n", + "\n", + " country content_rating imdb_score gross gross_log \n", + "4498 Italy Approved 8.9 6100000.0 15.623799 \n", + "270 New Zealand PG-13 8.8 313837577.0 19.564386 \n", + "4029 Brazil R 8.7 7563397.0 15.838831 " + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mask = ((df_cols.imdb_score > 8) & (df_cols.country != 'USA') & (df_cols.gross > 10**6))\n", + "df_cols[mask].nlargest(3, columns=['imdb_score'])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# >=, AND, OR, ==, ~, str.contains, \n", + "# str.startswith, sort_values, sort_index, filtering on sorted/unsorted, isin()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_scoregrossgross_log
2765Towering InfernoJohn BlanchardCanadaNaN9.5NaNNaN
339The Lord of the Rings: The Return of the KingPeter JacksonUSAPG-138.9377019252.019.747807
270The Lord of the Rings: The Fellowship of the R...Peter JacksonNew ZealandPG-138.8313837577.019.564386
340The Lord of the Rings: The Two TowersPeter JacksonUSAPG-138.7340478898.019.645864
1196The Conjuring 2James WanUSAR7.8102310175.018.443520
\n", + "
" + ], + "text/plain": [ + " movie_title director_name \\\n", + "2765 Towering Inferno  John Blanchard \n", + "339 The Lord of the Rings: The Return of the King  Peter Jackson \n", + "270 The Lord of the Rings: The Fellowship of the R... Peter Jackson \n", + "340 The Lord of the Rings: The Two Towers  Peter Jackson \n", + "1196 The Conjuring 2  James Wan \n", + "\n", + " country content_rating imdb_score gross gross_log \n", + "2765 Canada NaN 9.5 NaN NaN \n", + "339 USA PG-13 8.9 377019252.0 19.747807 \n", + "270 New Zealand PG-13 8.8 313837577.0 19.564386 \n", + "340 USA PG-13 8.7 340478898.0 19.645864 \n", + "1196 USA R 7.8 102310175.0 18.443520 " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols[df_cols.movie_title.str.lower().str.contains(\"ring\")].nlargest(5, columns=['imdb_score'])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "USA 3807\n", + "UK 448\n", + "France 154\n", + "Canada 126\n", + "Germany 97\n", + "Name: country, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.country.value_counts().head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Afghanistan 1\n", + "Argentina 4\n", + "Aruba 1\n", + "Australia 55\n", + "Bahamas 1\n", + "Name: country, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.country.value_counts().sort_index().head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split-apply-combine and pivots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "- **Split-apply-combine and pivots**: groupby, dt.month, dt.year, groupby.mean(), agg, stack, 
unstack, pivot, melt, merge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Directors with the most movies" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_title
director_name
Steven Spielberg26
Woody Allen22
Clint Eastwood20
Martin Scorsese20
Ridley Scott16
\n", + "
" + ], + "text/plain": [ + " movie_title\n", + "director_name \n", + "Steven Spielberg 26\n", + "Woody Allen 22\n", + "Clint Eastwood 20\n", + "Martin Scorsese 20\n", + "Ridley Scott 16" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df_cols.groupby(df.director_name).nunique().movie_title.nlargest(5).to_frame())" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imdb_score
director_name
John Blanchard9.5
Cary Bell8.7
Mitchell Altieri8.7
Sadyk Sher-Niyaz8.7
Charles Chaplin8.6
\n", + "
" + ], + "text/plain": [ + " imdb_score\n", + "director_name \n", + "John Blanchard 9.5\n", + "Cary Bell 8.7\n", + "Mitchell Altieri 8.7\n", + "Sadyk Sher-Niyaz 8.7\n", + "Charles Chaplin 8.6" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df_cols.groupby(df.director_name).mean().imdb_score.nlargest(5).to_frame())" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movie_titledirector_namecountrycontent_ratingimdb_scoregrossgross_log
0AvatarJames CameronUSAPG-137.9760505847.020.449494
1Pirates of the Caribbean: At World's EndGore VerbinskiUSAPG-137.1309404152.019.550159
2SpectreSam MendesUKPG-136.8200074175.019.114199
3The Dark Knight RisesChristopher NolanUSAPG-138.5448130642.019.920595
4Star Wars: Episode VII - The Force Awakens  ...Doug WalkerNaNNaN7.1NaNNaN
\n", + "
" + ], + "text/plain": [ + " movie_title director_name \\\n", + "0 Avatar  James Cameron \n", + "1 Pirates of the Caribbean: At World's End  Gore Verbinski \n", + "2 Spectre  Sam Mendes \n", + "3 The Dark Knight Rises  Christopher Nolan \n", + "4 Star Wars: Episode VII - The Force Awakens  ... Doug Walker \n", + "\n", + " country content_rating imdb_score gross gross_log \n", + "0 USA PG-13 7.9 760505847.0 20.449494 \n", + "1 USA PG-13 7.1 309404152.0 19.550159 \n", + "2 UK PG-13 6.8 200074175.0 19.114199 \n", + "3 USA PG-13 8.5 448130642.0 19.920595 \n", + "4 NaN NaN 7.1 NaN NaN " + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_cols.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imdb_scoregross_logmovie_title
director_name
A. Raven Cruz1.90.0000001
Aaron Hann6.00.0000001
Aaron Schneider7.116.0321621
\n", + "
" + ], + "text/plain": [ + " imdb_score gross_log movie_title\n", + "director_name \n", + "A. Raven Cruz 1.9 0.000000 1\n", + "Aaron Hann 6.0 0.000000 1\n", + "Aaron Schneider 7.1 16.032162 1" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "director_stats = (df_cols.groupby(df.director_name).agg({'imdb_score':np.mean, 'gross_log':np.sum, 'movie_title':'nunique'}))\n", + "\n", + "director_stats.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imdb_scoregross_logmovie_title
director_name
A. Raven Cruz-3.885663-0.737455-0.488133
Aaron Hann-0.213965-0.737455-0.488133
Aaron Schneider0.771125-0.321155-0.488133
\n", + "
" + ], + "text/plain": [ + " imdb_score gross_log movie_title\n", + "director_name \n", + "A. Raven Cruz -3.885663 -0.737455 -0.488133\n", + "Aaron Hann -0.213965 -0.737455 -0.488133\n", + "Aaron Schneider 0.771125 -0.321155 -0.488133" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "director_stats_norm = ((director_stats - director_stats.mean()) / director_stats.std())\n", + "director_stats_norm.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imdb_scoregross_logmovie_titlescore
director_name
Steven Spielberg1.11211711.41921611.60925424.140587
Woody Allen0.6897127.2995739.67367217.662958
Clint Eastwood0.8830678.0419618.70588117.630909
\n", + "
" + ], + "text/plain": [ + " imdb_score gross_log movie_title score\n", + "director_name \n", + "Steven Spielberg 1.112117 11.419216 11.609254 24.140587\n", + "Woody Allen 0.689712 7.299573 9.673672 17.662958\n", + "Clint Eastwood 0.883067 8.041961 8.705881 17.630909" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "director_stats_norm.assign(score = lambda df: df.sum(axis = 1)).nlargest(3, 'score')" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
content_ratingApprovedGGPMNC-17Not RatedPGPG-13PassedRTV-14TV-GTV-PGUnratedX
title_year
2012.00.02.00.00.00.018.026.071.00.084.00.01.00.00.00.0
2013.00.01.00.00.00.014.026.068.00.093.01.00.00.01.00.0
2014.00.03.00.00.00.09.028.072.00.086.00.00.00.05.00.0
2015.00.01.00.00.00.05.022.059.00.077.01.00.00.01.00.0
2016.00.00.00.00.00.02.011.041.00.031.00.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + "content_rating Approved G GP M NC-17 Not Rated PG PG-13 \\\n", + "title_year \n", + "2012.0 0.0 2.0 0.0 0.0 0.0 18.0 26.0 71.0 \n", + "2013.0 0.0 1.0 0.0 0.0 0.0 14.0 26.0 68.0 \n", + "2014.0 0.0 3.0 0.0 0.0 0.0 9.0 28.0 72.0 \n", + "2015.0 0.0 1.0 0.0 0.0 0.0 5.0 22.0 59.0 \n", + "2016.0 0.0 0.0 0.0 0.0 0.0 2.0 11.0 41.0 \n", + "\n", + "content_rating Passed R TV-14 TV-G TV-PG Unrated X \n", + "title_year \n", + "2012.0 0.0 84.0 0.0 1.0 0.0 0.0 0.0 \n", + "2013.0 0.0 93.0 1.0 0.0 0.0 1.0 0.0 \n", + "2014.0 0.0 86.0 0.0 0.0 0.0 5.0 0.0 \n", + "2015.0 0.0 77.0 1.0 0.0 0.0 1.0 0.0 \n", + "2016.0 0.0 31.0 0.0 0.0 0.0 0.0 0.0 " + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df.loc[:, ('title_year', 'content_rating', 'movie_title')]\n", + " .groupby(['title_year', 'content_rating']).nunique().movie_title).unstack(1).fillna(0).tail(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pivot table" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
colordirector_namenum_critic_for_reviewsdurationdirector_facebook_likesactor_3_facebook_likesactor_2_nameactor_1_facebook_likesgrossgenresactor_1_namemovie_titlenum_voted_userscast_total_facebook_likesactor_3_namefacenumber_in_posterplot_keywordsmovie_imdb_linknum_user_for_reviewslanguagecountrycontent_ratingbudgettitle_yearactor_2_facebook_likesimdb_scoreaspect_ratiomovie_facebook_likes
0ColorJames Cameron723.0178.00.0855.0Joel David Moore1000.0760505847.0Action|Adventure|Fantasy|Sci-FiCCH PounderAvatar8862044834Wes Studi0.0avatar|future|marine|native|paraplegichttp://www.imdb.com/title/tt0499549/?ref_=fn_t...3054.0EnglishUSAPG-13237000000.02009.0936.07.91.7833000
1ColorGore Verbinski302.0169.0563.01000.0Orlando Bloom40000.0309404152.0Action|Adventure|FantasyJohnny DeppPirates of the Caribbean: At World's End47122048350Jack Davenport0.0goddess|marriage ceremony|marriage proposal|pi...http://www.imdb.com/title/tt0449088/?ref_=fn_t...1238.0EnglishUSAPG-13300000000.02007.05000.07.12.350
2ColorSam Mendes602.0148.00.0161.0Rory Kinnear11000.0200074175.0Action|Adventure|ThrillerChristoph WaltzSpectre27586811700Stephanie Sigman1.0bomb|espionage|sequel|spy|terroristhttp://www.imdb.com/title/tt2379713/?ref_=fn_t...994.0EnglishUKPG-13245000000.02015.0393.06.82.3585000
\n", + "
" + ], + "text/plain": [ + " color director_name num_critic_for_reviews duration \\\n", + "0 Color James Cameron 723.0 178.0 \n", + "1 Color Gore Verbinski 302.0 169.0 \n", + "2 Color Sam Mendes 602.0 148.0 \n", + "\n", + " director_facebook_likes actor_3_facebook_likes actor_2_name \\\n", + "0 0.0 855.0 Joel David Moore \n", + "1 563.0 1000.0 Orlando Bloom \n", + "2 0.0 161.0 Rory Kinnear \n", + "\n", + " actor_1_facebook_likes gross genres \\\n", + "0 1000.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi \n", + "1 40000.0 309404152.0 Action|Adventure|Fantasy \n", + "2 11000.0 200074175.0 Action|Adventure|Thriller \n", + "\n", + " actor_1_name movie_title \\\n", + "0 CCH Pounder Avatar  \n", + "1 Johnny Depp Pirates of the Caribbean: At World's End  \n", + "2 Christoph Waltz Spectre  \n", + "\n", + " num_voted_users cast_total_facebook_likes actor_3_name \\\n", + "0 886204 4834 Wes Studi \n", + "1 471220 48350 Jack Davenport \n", + "2 275868 11700 Stephanie Sigman \n", + "\n", + " facenumber_in_poster plot_keywords \\\n", + "0 0.0 avatar|future|marine|native|paraplegic \n", + "1 0.0 goddess|marriage ceremony|marriage proposal|pi... \n", + "2 1.0 bomb|espionage|sequel|spy|terrorist \n", + "\n", + " movie_imdb_link num_user_for_reviews \\\n", + "0 http://www.imdb.com/title/tt0499549/?ref_=fn_t... 3054.0 \n", + "1 http://www.imdb.com/title/tt0449088/?ref_=fn_t... 1238.0 \n", + "2 http://www.imdb.com/title/tt2379713/?ref_=fn_t... 
994.0 \n", + "\n", + " language country content_rating budget title_year \\\n", + "0 English USA PG-13 237000000.0 2009.0 \n", + "1 English USA PG-13 300000000.0 2007.0 \n", + "2 English UK PG-13 245000000.0 2015.0 \n", + "\n", + " actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes \n", + "0 936.0 7.9 1.78 33000 \n", + "1 5000.0 7.1 2.35 0 \n", + "2 393.0 6.8 2.35 85000 " + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
content_ratingApprovedGGPMNC-17Not RatedPGPG-13PassedRTV-14TV-GTV-PGUnratedX
title_year
2012.00.02.00.00.00.018.026.071.00.084.00.01.00.00.00.0
2013.00.01.00.00.00.014.026.068.00.093.01.00.00.01.00.0
2014.00.03.00.00.00.09.028.072.00.086.00.00.00.05.00.0
2015.00.01.00.00.00.05.022.059.00.077.01.00.00.01.00.0
2016.00.00.00.00.00.02.011.041.00.031.00.00.00.00.00.0
\n", + "
" + ], + "text/plain": [ + "content_rating Approved G GP M NC-17 Not Rated PG PG-13 \\\n", + "title_year \n", + "2012.0 0.0 2.0 0.0 0.0 0.0 18.0 26.0 71.0 \n", + "2013.0 0.0 1.0 0.0 0.0 0.0 14.0 26.0 68.0 \n", + "2014.0 0.0 3.0 0.0 0.0 0.0 9.0 28.0 72.0 \n", + "2015.0 0.0 1.0 0.0 0.0 0.0 5.0 22.0 59.0 \n", + "2016.0 0.0 0.0 0.0 0.0 0.0 2.0 11.0 41.0 \n", + "\n", + "content_rating Passed R TV-14 TV-G TV-PG Unrated X \n", + "title_year \n", + "2012.0 0.0 84.0 0.0 1.0 0.0 0.0 0.0 \n", + "2013.0 0.0 93.0 1.0 0.0 0.0 1.0 0.0 \n", + "2014.0 0.0 86.0 0.0 0.0 0.0 5.0 0.0 \n", + "2015.0 0.0 77.0 1.0 0.0 0.0 1.0 0.0 \n", + "2016.0 0.0 31.0 0.0 0.0 0.0 0.0 0.0 " + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df\n", + " .pivot_table(\n", + " values='movie_title', \n", + " index='title_year', \n", + " columns='content_rating', \n", + " aggfunc=pd.DataFrame.nunique)\n", + ".fillna(0)\n", + ".tail(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "values : column to aggregate, optional\n", + "\n", + "index : column, Grouper, array, or list of the previous\n", + "\n", + " If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values.\n", + "\n", + "columns : column, Grouper, array, or list of the previous\n", + "\n", + " If an array is passed, it must be the same length as the data. The list can contain any of the other types (except list). Keys to group by on the pivot table column. 
If an array is passed, it is being used as the same manner as column values.\n", + "\n", + "aggfunc : function or list of functions, default numpy.mean\n", + "\n", + " If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves)\n", + "\n", + "fill_value : scalar, default None\n", + "\n", + " Value to replace missing values with\n", + "\n", + "margins : boolean, default False\n", + "\n", + " Add all row / columns (e.g. for subtotal / grand totals)\n", + "\n", + "dropna : boolean, default True\n", + "\n", + " Do not include columns whose entries are all NaN\n", + "\n", + "margins_name : string, default ‘All’\n", + "\n", + " Name of the row / column that will contain the totals when margins is True.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/tommy/anaconda3/lib/python3.6/site-packages/pandas/core/groupby.py:4291: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version\n", + " return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
msdf
title_year
1916.0123.000000NaN
1920.0110.000000NaN
1925.0151.000000NaN
1927.0145.000000NaN
1929.0105.0000007.071068
1930.096.000000NaN
1932.079.000000NaN
1933.077.50000016.263456
1934.065.000000NaN
1935.081.000000NaN
1936.093.5000009.192388
1937.092.00000012.727922
1938.0116.00000014.142136
1939.0149.33333367.002488
1940.0108.00000021.059440
1941.0118.000000NaN
1942.076.0000008.485281
1943.0122.000000NaN
1944.0101.000000NaN
1945.0103.75000010.500000
1946.0144.66666727.006172
1947.0101.66666714.224392
1948.098.66666729.143324
1949.0106.0000004.242641
1950.0107.000000NaN
1951.0134.66666732.593455
1952.0106.75000031.223656
1953.0106.25000024.878036
1954.0140.60000036.073536
1955.0112.50000019.091883
1956.0103.00000013.114877
1957.0128.50000045.961941
.........
1985.0108.72413820.325734
1986.0104.65384614.707664
1987.0105.50000029.359837
1988.0107.00000018.638669
1989.0113.12121219.582769
1990.0114.10000032.563307
1991.0113.06451626.856328
1992.0116.08823524.342714
1993.0120.22916743.293280
1994.0111.48148125.176165
1995.0114.52857124.225224
1996.0110.01010118.521083
1997.0110.25423721.591858
1998.0109.46268720.978204
1999.0108.64285720.332059
2000.0107.53529420.673610
2001.0106.94148923.319033
2002.0104.22966518.260268
2003.0106.40236722.667451
2004.0107.29906520.988579
2005.0107.50454522.870790
2006.0107.14705920.243023
2007.0106.89705924.419854
2008.0105.38222217.785318
2009.0105.12741318.262597
2010.0105.43421115.323819
2011.0105.03571416.797356
2012.0105.90000022.001640
2013.0108.09322022.074145
2014.0105.42629519.384879
2015.0106.02232117.848250
2016.0109.63207517.020618
\n", + "

91 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " m sdf\n", + "title_year \n", + "1916.0 123.000000 NaN\n", + "1920.0 110.000000 NaN\n", + "1925.0 151.000000 NaN\n", + "1927.0 145.000000 NaN\n", + "1929.0 105.000000 7.071068\n", + "1930.0 96.000000 NaN\n", + "1932.0 79.000000 NaN\n", + "1933.0 77.500000 16.263456\n", + "1934.0 65.000000 NaN\n", + "1935.0 81.000000 NaN\n", + "1936.0 93.500000 9.192388\n", + "1937.0 92.000000 12.727922\n", + "1938.0 116.000000 14.142136\n", + "1939.0 149.333333 67.002488\n", + "1940.0 108.000000 21.059440\n", + "1941.0 118.000000 NaN\n", + "1942.0 76.000000 8.485281\n", + "1943.0 122.000000 NaN\n", + "1944.0 101.000000 NaN\n", + "1945.0 103.750000 10.500000\n", + "1946.0 144.666667 27.006172\n", + "1947.0 101.666667 14.224392\n", + "1948.0 98.666667 29.143324\n", + "1949.0 106.000000 4.242641\n", + "1950.0 107.000000 NaN\n", + "1951.0 134.666667 32.593455\n", + "1952.0 106.750000 31.223656\n", + "1953.0 106.250000 24.878036\n", + "1954.0 140.600000 36.073536\n", + "1955.0 112.500000 19.091883\n", + "1956.0 103.000000 13.114877\n", + "1957.0 128.500000 45.961941\n", + "... ... 
...\n", + "1985.0 108.724138 20.325734\n", + "1986.0 104.653846 14.707664\n", + "1987.0 105.500000 29.359837\n", + "1988.0 107.000000 18.638669\n", + "1989.0 113.121212 19.582769\n", + "1990.0 114.100000 32.563307\n", + "1991.0 113.064516 26.856328\n", + "1992.0 116.088235 24.342714\n", + "1993.0 120.229167 43.293280\n", + "1994.0 111.481481 25.176165\n", + "1995.0 114.528571 24.225224\n", + "1996.0 110.010101 18.521083\n", + "1997.0 110.254237 21.591858\n", + "1998.0 109.462687 20.978204\n", + "1999.0 108.642857 20.332059\n", + "2000.0 107.535294 20.673610\n", + "2001.0 106.941489 23.319033\n", + "2002.0 104.229665 18.260268\n", + "2003.0 106.402367 22.667451\n", + "2004.0 107.299065 20.988579\n", + "2005.0 107.504545 22.870790\n", + "2006.0 107.147059 20.243023\n", + "2007.0 106.897059 24.419854\n", + "2008.0 105.382222 17.785318\n", + "2009.0 105.127413 18.262597\n", + "2010.0 105.434211 15.323819\n", + "2011.0 105.035714 16.797356\n", + "2012.0 105.900000 22.001640\n", + "2013.0 108.093220 22.074145\n", + "2014.0 105.426295 19.384879\n", + "2015.0 106.022321 17.848250\n", + "2016.0 109.632075 17.020618\n", + "\n", + "[91 rows x 2 columns]" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_plot = df.groupby(df.title_year).agg({df.duration.name:{'m':np.mean, 'sdf':np.std}})\n", + "\n", + "to_plot.columns = to_plot.columns.droplevel()\n", + "\n", + "#to_plot = to_plot.assign(low = lambda df: df.mean - df.std)\n", + "\n", + "to_plot" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -95,7 +3191,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.2" + "version": "3.6.4" } }, "nbformat": 4,