diff --git a/DataFrames/ALL.ipynb b/DataFrames/ALL.ipynb new file mode 100644 index 0000000..95b169b --- /dev/null +++ b/DataFrames/ALL.ipynb @@ -0,0 +1,1258 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:14:45.083313Z", + "start_time": "2020-11-17T23:14:44.818195Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "UserWarning: The Dask Engine for Modin is experimental.\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "import gc\n", + "from time import time, sleep\n", + "\n", + "import pandas as pd\n", + "import dask.dataframe as dd\n", + "import modin.pandas as mpd\n", + "import vaex\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import sum, avg\n", + "# pandas on ray has moved to Modin\n", + "# import ray.dataframe as rpd" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:55:27.722264Z", + "start_time": "2020-11-17T22:55:27.699399Z" + } + }, + "outputs": [], + "source": [ + "# data based on https://www.kaggle.com/c/ieee-fraud-detection/data (set FRAUD_DATA_DIR to point at a local copy)\n", + "folder = os.environ.get(\"FRAUD_DATA_DIR\", \"/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection\")\n", + "files = [\"train_transaction.csv\", \"train_identity.csv\"]\n", + "paths = [os.path.join(folder, f) for f in files]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:55:27.733967Z", + "start_time": "2020-11-17T22:55:27.727006Z" + } + }, + "outputs": [], + "source": [ + "stats = {}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:55:27.750077Z", + "start_time": "2020-11-17T22:55:27.737957Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.1.4'" + ] + }, +
"execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:56:19.465362Z", + "start_time": "2020-11-17T22:55:27.767439Z" + } + }, + "outputs": [], + "source": [ + "stats[\"pandas\"] = {}\n", + "s = stats[\"pandas\"]\n", + "\n", + "ts = time()\n", + "df = pd.read_csv(paths[0])\n", + "te = time()\n", + "s[\"load_transactions\"] = te-ts\n", + "\n", + "ts = time()\n", + "df2 = pd.read_csv(paths[1])\n", + "te = time()\n", + "s[\"load_identity\"] = te-ts\n", + "\n", + "ts = time()\n", + "dff = df.merge(df2, on=\"TransactionID\")\n", + "te = time()\n", + "s[\"merge\"] = te-ts\n", + "# mean and sum of TransactionAmt per group; selecting several columns needs a list (double brackets)\n", + "ts = time()\n", + "grp = dff[[\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\",\"TransactionAmt\"]].fillna(\"\")\\\n", + ".groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"].agg([\"mean\",\"sum\"])\n", + "te = time()\n", + "s[\"aggregation\"] = te-ts\n", + "\n", + "ts = time()\n", + "dff.sort_values(by=[\"card1\",\"addr1\",\"D9\"], inplace=True)\n", + "dff.sort_values(by=[\"addr1\",\"D9\",\"card1\"], inplace=True)\n", + "dff.sort_values(by=[\"D9\",\"card1\",\"addr1\"], inplace=True)\n", + "te = time()\n", + "s[\"sorting\"] = te-ts" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:56:19.519391Z", + "start_time": "2020-11-17T22:56:19.472412Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pandas
aggregation0.075788
load_identity0.682109
load_transactions18.279765
merge3.196074
sorting2.224164
\n", + "
" + ], + "text/plain": [ + " pandas\n", + "aggregation 0.075788\n", + "load_identity 0.682109\n", + "load_transactions 18.279765\n", + "merge 3.196074\n", + "sorting 2.224164" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(stats)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "dff.to_pickle(\"data/dff.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4553, 2)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Because Julia's groupby includes N/A keys, fill them with a placeholder and just check that the number of groups matches\n", + "grp = dff[[\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\",\"TransactionAmt\"]].fillna(\"~U~\")\\\n", + ".groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"].agg([\"mean\",\"sum\"])\n", + "grp.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:56:19.550559Z", + "start_time": "2020-11-17T22:56:19.524963Z" + } + }, + "outputs": [], + "source": [ + "def clean(wait_time: int=15, names=(\"df\", \"df2\", \"dff\", \"grp\")):\n", + "    \"\"\"Delete the global variables listed in `names` (absent ones are skipped), run gc.collect() and sleep `wait_time` seconds (15 by default).\"\"\"\n", +
"    for name in names:\n", + "        globals().pop(name, None)  # rebinding locals to None, as before, left the notebook's DataFrames alive\n", + "    gc.collect()\n", + "    sleep(wait_time)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:56:34.718818Z", + "start_time": "2020-11-17T22:56:19.559830Z" + } + }, + "outputs": [], + "source": [ + "clean()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:56:34.813222Z", + "start_time": "2020-11-17T22:56:34.780299Z" + } + }, + "outputs": [], + "source": [ + "def list_variables_memory_usage() -> dict:\n", + "    \"\"\"Return a dict mapping every global (notebook-level) variable name to its sys.getsizeof() size in bytes.\"\"\"\n", + "    namespace = list(globals().items())  # snapshot; locals() here would only see this function's own empty scope\n", + "    return {var: sys.getsizeof(obj) for var, obj in namespace}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dask\n", + "When to use dask - https://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:57:00.280745Z", + "start_time": "2020-11-17T22:56:34.844985Z" + } + }, + "outputs": [], + "source": [ + "stats[\"dask\"] = {}\n", + "s = stats[\"dask\"]\n", + "\n", + "ts = time()\n", + "df = dd.read_csv(paths[0])\n", + "te = time()\n", + "s[\"load_transactions\"] = te-ts\n", + "\n", + "ts = time()\n", + "df2 = dd.read_csv(paths[1])\n", + "te = time()\n", + "s[\"load_identity\"] = te-ts\n", + "\n", + "ts = time()\n", + "dff = df.merge(df2, on=\"TransactionID\")\n", + "te = time()\n", + "s[\"merge\"] = te-ts\n", + "\n", + "# the difference is that we call compute method, which runs all the computations at this point\n", + "ts = time()\n", + "grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"]\\\n", + " .agg([\"mean\",\"sum\"])\\\n", + " .compute()\n", + "te = time()\n", + "s[\"aggregation\"] = 
te-ts\n", + "\n", + "# parallel sorting is tricky, which is why there are only workarounds in dask.\n", + "ts = time()\n", + "dff.set_index(\"card1\").compute()\n", + "te = time()\n", + "s[\"sorting\"] = te-ts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:57:15.539155Z", + "start_time": "2020-11-17T22:57:00.286799Z" + } + }, + "outputs": [], + "source": [ + "clean()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:57:15.580380Z", + "start_time": "2020-11-17T22:57:15.546567Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pandasdask
load_transactions18.2797650.083901
load_identity0.6821090.028268
merge3.1960740.073891
aggregation0.07578820.837958
sorting2.22416471.282675
\n", + "
" + ], + "text/plain": [ + " pandas dask\n", + "load_transactions 18.279765 0.083901\n", + "load_identity 0.682109 0.028268\n", + "merge 3.196074 0.073891\n", + "aggregation 0.075788 20.837958\n", + "sorting 2.224164 71.282675" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(stats)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "stats[\"dask_indexed\"] = {}\n", + "s = stats[\"dask_indexed\"]\n", + "\n", + "ts = time()\n", + "df = dd.read_csv(paths[0]).set_index(\"TransactionID\")\n", + "te = time()\n", + "s[\"load_transactions\"] = te-ts\n", + "\n", + "ts = time()\n", + "df2 = dd.read_csv(paths[1]).set_index(\"TransactionID\")\n", + "te = time()\n", + "s[\"load_identity\"] = te-ts\n", + "\n", + "ts = time()\n", + "dff = df.merge(df2, left_index=True, right_index=True)\n", + "te = time()\n", + "s[\"merge\"] = te-ts\n", + "\n", + "# the difference is that we call compute method, which runs all the computations at this point\n", + "ts = time()\n", + "grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"]\\\n", + " .agg([\"mean\",\"sum\"])\\\n", + " .compute()\n", + "te = time()\n", + "s[\"aggregation\"] = te-ts\n", + "\n", + "# parallel sorting is tricky, which is why there are only workarounds in dask.\n", + "ts = time()\n", + "dff.set_index(\"card1\").compute()\n", + "te = time()\n", + "s[\"sorting\"] = te-ts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pandasdaskdask_indexedvaex
load_transactions18.2797650.08390114.930128NaN
load_identity0.6821090.0282680.761821NaN
merge3.1960740.0738910.078762NaN
aggregation0.07578820.83795823.130105NaN
sorting2.22416471.28267575.393628NaN
\n", + "
" + ], + "text/plain": [ + " pandas dask dask_indexed vaex\n", + "load_transactions 18.279765 0.083901 14.930128 NaN\n", + "load_identity 0.682109 0.028268 0.761821 NaN\n", + "merge 3.196074 0.073891 0.078762 NaN\n", + "aggregation 0.075788 20.837958 23.130105 NaN\n", + "sorting 2.224164 71.282675 75.393628 NaN" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clean()\n", + "pd.DataFrame(stats)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:29:19.438715Z", + "start_time": "2020-11-17T22:29:19.429209Z" + } + }, + "source": [ + "# Vaex" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:03:06.560013Z", + "start_time": "2020-11-17T23:03:06.545427Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'vaex-core': '2.0.3',\n", + " 'vaex-viz': '0.4.0',\n", + " 'vaex-hdf5': '0.6.0',\n", + " 'vaex-server': '0.3.1',\n", + " 'vaex-astro': '0.7.0',\n", + " 'vaex-jupyter': '0.5.2',\n", + " 'vaex-ml': '0.9.0',\n", + " 'vaex-arrow': '0.5.1'}" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vaex.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:04:01.165275Z", + "start_time": "2020-11-17T23:03:06.562006Z" + } + }, + "outputs": [], + "source": [ + "tool = \"vaex\"\n", + "stats[tool] = {}\n", + "s = stats[tool]\n", + "\n", + "\n", + "ts = time()\n", + "df = vaex.open(paths[0])\n", + "te = time()\n", + "s[\"load_transactions\"] = te-ts\n", + "\n", + "ts = time()\n", + "df2 = vaex.open(paths[1])\n", + "te = time()\n", + "s[\"load_identity\"] = te-ts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:04:01.642707Z", + "start_time": 
"2020-11-17T23:04:01.176085Z" + } + }, + "outputs": [], + "source": [ + "ts = time()\n", + "dff = df.join(df2, on=\"TransactionID\")\n", + "te = time()\n", + "s[\"merge\"] = te-ts" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:04:03.395316Z", + "start_time": "2020-11-17T23:04:01.645742Z" + } + }, + "outputs": [], + "source": [ + "# unlike the dask cells, no .compute() is called here; the groupby(..., agg=...) call is timed as-is\n", + "ts = time()\n", + "grp = dff.groupby([dff[\"isFraud\"],dff[\"ProductCD\"],dff[\"card4\"],dff[\"card6\"],dff[\"id_15\"],dff[\"id_31\"]], \n", + " agg=[vaex.agg.mean('TransactionAmt'), vaex.agg.sum('TransactionAmt')])\n", + "te = time()\n", + "s[\"aggregation\"] = te-ts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "# unlike the dask cells, no .compute() is called here; the three sorts use the same key orders as the pandas cell\n", + "ts = time()\n", + "dff_s = dff.sort(by=[\"card1\",\"addr1\",\"D9\"])\n", + "dff_s = dff.sort(by=[\"addr1\",\"D9\",\"card1\"])\n", + "dff_s = dff.sort(by=[\"D9\",\"card1\",\"addr1\"])\n", + "te = time()\n", + "s[\"sorting\"] = te-ts" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:04:03.469428Z", + "start_time": "2020-11-17T23:04:03.423857Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pandasdaskdask_indexedvaex
load_transactions18.2797650.08390114.93012818.734002
load_identity0.6821090.0282680.7618211.023915
merge3.1960740.0738910.0787620.131490
aggregation0.07578820.83795823.1301050.383996
sorting2.22416471.28267575.3936281.035000
sortNaNNaNNaN0.329828
\n", + "
" + ], + "text/plain": [ + " pandas dask dask_indexed vaex\n", + "load_transactions 18.279765 0.083901 14.930128 18.734002\n", + "load_identity 0.682109 0.028268 0.761821 1.023915\n", + "merge 3.196074 0.073891 0.078762 0.131490\n", + "aggregation 0.075788 20.837958 23.130105 0.383996\n", + "sorting 2.224164 71.282675 75.393628 1.035000\n", + "sort NaN NaN NaN 0.329828" + ] + }, + "execution_count": 113, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(stats)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:04:18.930053Z", + "start_time": "2020-11-17T23:04:03.543914Z" + } + }, + "outputs": [], + "source": [ + "clean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PySpark" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:04:27.010489Z", + "start_time": "2020-11-17T23:04:18.932048Z" + } + }, + "outputs": [], + "source": [ + "from pyspark import SparkContext\n", + "sc = SparkContext()\n", + "sc.version\n", + "sc.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:04:29.416261Z", + "start_time": "2020-11-17T23:04:27.011485Z" + } + }, + "outputs": [], + "source": [ + "# Create my_spark\n", + "my_spark = SparkSession.builder \\\n", + " .master(\"local\") \\\n", + " .appName(\"Pandas Alternative\") \\\n", + " .config(\"spark.some.config.option\", \"some-value\") \\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:05:33.528531Z", + "start_time": "2020-11-17T23:04:29.419253Z" + } + }, + "outputs": [], + "source": [ + "tool = \"spark\"\n", + "stats[tool] = {}\n", + "s = stats[tool]\n", + "\n", + "\n", + "ts = time()\n", + "df = my_spark.read.csv(paths[0],inferSchema = 
True,header= True) \n", + "te = time()\n", + "s[\"load_transactions\"] = te-ts\n", + "\n", + "ts = time()\n", + "df2 = my_spark.read.csv(paths[1],inferSchema = True,header= True) \n", + "te = time()\n", + "s[\"load_identity\"] = te-ts" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:05:33.849489Z", + "start_time": "2020-11-17T23:05:33.534687Z" + } + }, + "outputs": [], + "source": [ + "\n", + "ts = time()\n", + "dff = df.join(df2, \"TransactionID\")\n", + "te = time()\n", + "s[\"merge\"] = te-ts" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:06:01.148952Z", + "start_time": "2020-11-17T23:05:33.851490Z" + } + }, + "outputs": [], + "source": [ + "# the difference is that we call collect method, which runs all the computations at this point\n", + "#ts = time()\n", + "#grp = dff.groupby([dff[\"isFraud\"],dff[\"ProductCD\"],dff[\"card4\"],dff[\"card6\"],dff[\"id_15\"],dff[\"id_31\"]]) \\\n", + "# .agg(avg(\"TransactionAmt\"), sum(\"TransactionAmt\"))\\\n", + "# .collect()\n", + "#te = time()\n", + "#s[\"aggregation\"] = te-ts\n", + "#s[\"all\"] = te-tss" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:13:24.332254Z", + "start_time": "2020-11-17T23:13:03.641149Z" + } + }, + "outputs": [], + "source": [ + "# the difference is that we call collect method, which runs all the computations at this point\n", + "ts = time()\n", + "grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]) \\\n", + " .agg(avg(\"TransactionAmt\"), sum(\"TransactionAmt\"))\\\n", + " .collect()\n", + "te = time()\n", + "s[\"aggregation\"] = te-ts\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:06:25.157340Z", + "start_time": "2020-11-17T23:06:25.118349Z" + } + }, 
+ "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pandas
aggregation0.060114
all19.908346
load_identity0.480164
load_transactions17.354527
merge2.013150
Total39.816302
\n", + "
" + ], + "text/plain": [ + " pandas\n", + "aggregation 0.060114\n", + "all 19.908346\n", + "load_identity 0.480164\n", + "load_transactions 17.354527\n", + "merge 2.013150\n", + "Total 39.816302" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "stats_df = pd.DataFrame(stats)\n", + "stats_df.loc['Total'] = stats_df.sum()\n", + "stats_df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:16:19.569545Z", + "start_time": "2020-11-17T22:16:19.559625Z" + } + }, + "source": [ + "# Modin" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:57:15.609009Z", + "start_time": "2020-11-17T22:57:15.586070Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.8.2'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mpd.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:00:06.292260Z", + "start_time": "2020-11-17T22:58:42.702035Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n" + ] + }, + { + "ename": "KilledWorker", + "evalue": "('lambda-dc847cac2df298f0ded2b3e426e3824d', )", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKilledWorker\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpaths\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[0mte\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"load_transactions\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mte\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mts\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\pandas\\io.py\u001b[0m in \u001b[0;36mparser_func\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, 
infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 107\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"sep\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"sep\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"\\t\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 109\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 110\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mparser_func\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\pandas\\io.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(**kwargs)\u001b[0m\n\u001b[0;32m 125\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mmodin\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata_management\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfactories\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatcher\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mEngineDispatcher\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 126\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 127\u001b[1;33m \u001b[0mpd_obj\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mEngineDispatcher\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 128\u001b[0m \u001b[1;31m# This happens when `read_csv` returns a TextFileReader object for iterating through\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd_obj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparsers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTextFileReader\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\data_management\\factories\\dispatcher.py\u001b[0m in \u001b[0;36mread_csv\u001b[1;34m(cls, **kwargs)\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 103\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 104\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 105\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\data_management\\factories\\factories.py\u001b[0m in \u001b[0;36m_read_csv\u001b[1;34m(cls, **kwargs)\u001b[0m\n\u001b[0;32m 85\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 86\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_read_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 87\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mio_cls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 88\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 89\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\engines\\base\\io\\file_reader.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 29\u001b[1;33m \u001b[0mquery_compiler\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 30\u001b[0m \u001b[1;31m# TODO (devin-petersohn): Make this section more general for non-pandas kernel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[1;31m# implementations.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\engines\\base\\io\\text\\csv_reader.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(cls, filepath_or_buffer, **kwargs)\u001b[0m\n\u001b[0;32m 174\u001b[0m \u001b[1;31m# or based on the column(s) that were requested.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 175\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mindex_col\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 176\u001b[1;33m \u001b[0mrow_lengths\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmaterialize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex_ids\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 177\u001b[0m \u001b[0mnew_index\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRangeIndex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow_lengths\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 178\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\engines\\dask\\task_wrapper.py\u001b[0m in \u001b[0;36mmaterialize\u001b[1;34m(cls, future)\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmaterialize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[0mclient\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_get_global_client\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 30\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mclient\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgather\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36mgather\u001b[1;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[0;32m 1990\u001b[0m \u001b[0mdirect\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdirect\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1991\u001b[0m \u001b[0mlocal_worker\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlocal_worker\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1992\u001b[1;33m \u001b[0masynchronous\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0masynchronous\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1993\u001b[0m )\n\u001b[0;32m 1994\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36msync\u001b[1;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[0;32m 831\u001b[0m 
\u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 832\u001b[0m return sync(\n\u001b[1;32m--> 833\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 834\u001b[0m )\n\u001b[0;32m 835\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\utils.py\u001b[0m in \u001b[0;36msync\u001b[1;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[0;32m 338\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 339\u001b[0m \u001b[0mtyp\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 340\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 341\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 342\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\utils.py\u001b[0m in \u001b[0;36mf\u001b[1;34m()\u001b[0m\n\u001b[0;32m 322\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcallback_timeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 323\u001b[0m \u001b[0mfuture\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait_for\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 324\u001b[1;33m \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32myield\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 325\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 326\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\tornado\\gen.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 733\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 734\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 735\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 736\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 737\u001b[0m \u001b[0mexc_info\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36m_gather\u001b[1;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[0;32m 1849\u001b[0m \u001b[0mexc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCancelledError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1850\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1851\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mexception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraceback\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1852\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1853\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"skip\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKilledWorker\u001b[0m: ('lambda-dc847cac2df298f0ded2b3e426e3824d', )" + ] + } + ], + "source": [ + "tool = \"modin\"\n", + "stats[tool] = {}\n", + "s = stats[tool]\n", + "\n", + "\n", + "ts = time()\n", + "df = mpd.read_csv(paths[0])\n", + "te = time()\n", + 
"s[\"load_transactions\"] = te-ts\n", + "\n", + "ts = time()\n", + "df2 = mpd.read_csv(paths[1])\n", + "te = time()\n", + "s[\"load_identity\"] = te-ts\n", + "\n", + "ts = time()\n", + "dff = df.merge(df2, on=\"TransactionID\")\n", + "te = time()\n", + "s[\"merge\"] = te-ts\n", + "\n", + "# modin defaults to pandas for multiple column aggregation and then fails on KeyError, though the key is available\n", + "ts = time()\n", + "try:\n", + " grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"].agg([\"mean\",\"sum\"])\n", + "except Exception as e:\n", + " print(e)\n", + "te = time()\n", + "s[\"aggregation\"] = te-ts\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T22:58:20.241434Z", + "start_time": "2020-11-17T22:55:17.025Z" + } + }, + "outputs": [], + "source": [ + "pd.DataFrame(stats)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T23:01:01.211911Z", + "start_time": "2020-11-17T23:00:45.850513Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n", + "distributed.nanny - WARNING - Worker exceeded 95% memory budget. 
Restarting\n", + "distributed.nanny - WARNING - Restarting worker\n" + ] + } + ], + "source": [ + "clean()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "big-tables", + "language": "python", + "name": "big-tables" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/DataFrames/Julia.ipynb b/DataFrames/Julia.ipynb new file mode 100644 index 0000000..7d7eeb8 --- /dev/null +++ b/DataFrames/Julia.ipynb @@ -0,0 +1,593 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-15T23:25:27.615000+01:00", + "start_time": "2020-11-15T22:25:00.718Z" + } + }, + "outputs": [], + "source": [ + "using CSV\n", + "using DataFrames\n", + "using Dates\n", + "using Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ 
+ "v\"1.4.1\"" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Julia version\n", + "VERSION" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-15T23:25:31.536000+01:00", + "start_time": "2020-11-15T22:25:00.722Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2-element Array{String,1}:\n", + " \"dff.pkl\"\n", + " \"sales_data_sample.csv\"" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readdir(\"data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2-element Array{String,1}:\n", + " \"train_transaction.csv\"\n", + " \"train_identity.csv\"" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "folder = \"/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection\"\n", + "files = [\"train_transaction.csv\", \"train_identity.csv\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection/train_transaction.csv\"" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "joinpath(folder,files[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dict{Any,Any} with 5 entries:\n", + " \"merge\" => 0.771\n", + " \"sort\" => 5.032\n", + " \"load_transactions\" => 8.045\n", + " \"aggregation\" => 0.034\n", + " \"load_identity\" => 0.502" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "s = Dict()\n", + "\n", + "# load transactions ~600MB\n", + "ts = now()\n", + "df = CSV.read(joinpath(folder,files[1]), DataFrame)\n", + 
"te = now()\n", + "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n", + "push!(s, \"load_transactions\"=>time_in_sec)\n", + "\n", + "# load identity ~25MB\n", + "ts = now()\n", + "df2 = CSV.read(joinpath(folder,files[2]), DataFrame)\n", + "te = now()\n", + "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n", + "push!(s, \"load_identity\"=>time_in_sec)\n", + "\n", + "# join\n", + "ts = now()\n", + "dff = join(df, df2, kind = :inner, on = \"TransactionID\")\n", + "te = now()\n", + "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n", + "push!(s, \"merge\"=>time_in_sec)\n", + "\n", + "# group by\n", + "ts = now()\n", + "grp = combine(groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]), \n", + " :TransactionAmt=>maximum=>:TransactionAmountMax, \n", + " :TransactionAmt=>mean=>:TransactionAmountMean)\n", + "te = now()\n", + "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n", + "push!(s, \"aggregation\"=>time_in_sec)\n", + "\n", + "# sort\n", + "ts = now()\n", + "sort!(dff, [\"card1\",\"addr1\",\"D9\"])\n", + "sort!(dff, [\"addr1\",\"D9\",\"card1\"])\n", + "sort!(dff, [\"D9\",\"card1\",\"addr1\"])\n", + "te = now()\n", + "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n", + "push!(s, \"sort\"=>time_in_sec)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

1 rows × 5 columns

aggregationload_identityload_transactionsmergesort
Float64Float64Float64Float64Float64
10.0340.5028.0450.7715.032
" + ], + "text/latex": [ + "\\begin{tabular}{r|ccccc}\n", + "\t& aggregation & load\\_identity & load\\_transactions & merge & sort\\\\\n", + "\t\\hline\n", + "\t& Float64 & Float64 & Float64 & Float64 & Float64\\\\\n", + "\t\\hline\n", + "\t1 & 0.034 & 0.502 & 8.045 & 0.771 & 5.032 \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "1×5 DataFrame\n", + "│ Row │ aggregation │ load_identity │ load_transactions │ merge │ sort │\n", + "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", + "├─────┼─────────────┼───────────────┼───────────────────┼─────────┼─────────┤\n", + "│ 1 │ 0.034 │ 0.502 │ 8.045 │ 0.771 │ 5.032 │" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DataFrame(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(144233, 434, 4553, 8)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check the shape of the dataframes\n", + "nrow(dff), length(names(dff)), nrow(grp), length(names(grp))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Groupby Details\n", + "https://dataframes.juliadata.org/stable/man/split_apply_combine/" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

GroupedDataFrame with 4553 groups based on keys: isFraud, ProductCD, card4, card6, id_15, id_31

First Group (136 rows): isFraud = 0, ProductCD = \"H\", card4 = \"visa\", card6 = \"debit\", id_15 = \"Found\", id_31 = \"firefox 57.0\"

TransactionIDisFraudTransactionDTTransactionAmtProductCDcard1card2
Int64Int64Int64Float64StringInt64Float64?
1306745301729328200.0H1030157.0
2307329601816710100.0H1675174.0
3306187201642116100.0H1974111.0
4307893001902540100.0H6697111.0
530387880121159275.0H7508321.0
630563360155646325.0H9500321.0
73026050095188250.0H9500321.0
82999258034827450.0H10680373.0
92995902026011640.0H12526381.0
1031244440276776950.0H12839321.0
1130918780207747840.0H2884490.0
123011076060887850.0H7508321.0
13317094404067304100.0H7664490.0
14317095004067403100.0H7664490.0
15317095104067438100.0H7664490.0
16314513303289750150.0H9112250.0
17303487401130093300.0H10294555.0
1831708450406530635.0H12695490.0
1931141690251638730.0H5822555.0
2032646480674664540.0H15497490.0
2130437040130488925.0H16993555.0
2231917040467875225.0H1323268.0
23309262002087380100.0H2772512.0
243023170087658150.0H17188321.0
2530629210165400550.0H17496554.0
2630989620217677675.0H12501490.0
2730990330217772775.0H12501490.0
2830395960122600550.0H14084257.0
2930573830157094025.0H15377555.0
3030440250131140825.0H15497490.0

Last Group (1 row): isFraud = 1, ProductCD = \"S\", card4 = \"visa\", card6 = \"credit\", id_15 = \"New\", id_31 = \"mobile safari 11.0\"

TransactionIDisFraudTransactionDTTransactionAmtProductCDcard1card2
Int64Int64Int64Float64StringInt64Float64?
132166931543956325.0S18375174.0
" + ], + "text/latex": [ + "GroupedDataFrame with 4553 groups based on keys: isFraud, ProductCD, card4, card6, id\\_15, id\\_31\n", + "\n", + "First Group (136 rows): isFraud = 0, ProductCD = \"H\", card4 = \"visa\", card6 = \"debit\", id\\_15 = \"Found\", id\\_31 = \"firefox 57.0\"\n", + "\n", + "\\begin{tabular}{r|cccccccc}\n", + "\t& TransactionID & isFraud & TransactionDT & TransactionAmt & ProductCD & card1 & card2 & \\\\\n", + "\t\\hline\n", + "\t& Int64 & Int64 & Int64 & Float64 & String & Int64 & Float64? & \\\\\n", + "\t\\hline\n", + "\t1 & 3067453 & 0 & 1729328 & 200.0 & H & 1030 & 157.0 & $\\dots$ \\\\\n", + "\t2 & 3073296 & 0 & 1816710 & 100.0 & H & 1675 & 174.0 & $\\dots$ \\\\\n", + "\t3 & 3061872 & 0 & 1642116 & 100.0 & H & 1974 & 111.0 & $\\dots$ \\\\\n", + "\t4 & 3078930 & 0 & 1902540 & 100.0 & H & 6697 & 111.0 & $\\dots$ \\\\\n", + "\t5 & 3038788 & 0 & 1211592 & 75.0 & H & 7508 & 321.0 & $\\dots$ \\\\\n", + "\t6 & 3056336 & 0 & 1556463 & 25.0 & H & 9500 & 321.0 & $\\dots$ \\\\\n", + "\t7 & 3026050 & 0 & 951882 & 50.0 & H & 9500 & 321.0 & $\\dots$ \\\\\n", + "\t8 & 2999258 & 0 & 348274 & 50.0 & H & 10680 & 373.0 & $\\dots$ \\\\\n", + "\t9 & 2995902 & 0 & 260116 & 40.0 & H & 12526 & 381.0 & $\\dots$ \\\\\n", + "\t10 & 3124444 & 0 & 2767769 & 50.0 & H & 12839 & 321.0 & $\\dots$ \\\\\n", + "\t11 & 3091878 & 0 & 2077478 & 40.0 & H & 2884 & 490.0 & $\\dots$ \\\\\n", + "\t12 & 3011076 & 0 & 608878 & 50.0 & H & 7508 & 321.0 & $\\dots$ \\\\\n", + "\t13 & 3170944 & 0 & 4067304 & 100.0 & H & 7664 & 490.0 & $\\dots$ \\\\\n", + "\t14 & 3170950 & 0 & 4067403 & 100.0 & H & 7664 & 490.0 & $\\dots$ \\\\\n", + "\t15 & 3170951 & 0 & 4067438 & 100.0 & H & 7664 & 490.0 & $\\dots$ \\\\\n", + "\t16 & 3145133 & 0 & 3289750 & 150.0 & H & 9112 & 250.0 & $\\dots$ \\\\\n", + "\t17 & 3034874 & 0 & 1130093 & 300.0 & H & 10294 & 555.0 & $\\dots$ \\\\\n", + "\t18 & 3170845 & 0 & 4065306 & 35.0 & H & 12695 & 490.0 & $\\dots$ \\\\\n", + "\t19 & 3114169 & 0 & 2516387 & 30.0 & H & 
5822 & 555.0 & $\\dots$ \\\\\n", + "\t20 & 3264648 & 0 & 6746645 & 40.0 & H & 15497 & 490.0 & $\\dots$ \\\\\n", + "\t21 & 3043704 & 0 & 1304889 & 25.0 & H & 16993 & 555.0 & $\\dots$ \\\\\n", + "\t22 & 3191704 & 0 & 4678752 & 25.0 & H & 1323 & 268.0 & $\\dots$ \\\\\n", + "\t23 & 3092620 & 0 & 2087380 & 100.0 & H & 2772 & 512.0 & $\\dots$ \\\\\n", + "\t24 & 3023170 & 0 & 876581 & 50.0 & H & 17188 & 321.0 & $\\dots$ \\\\\n", + "\t25 & 3062921 & 0 & 1654005 & 50.0 & H & 17496 & 554.0 & $\\dots$ \\\\\n", + "\t26 & 3098962 & 0 & 2176776 & 75.0 & H & 12501 & 490.0 & $\\dots$ \\\\\n", + "\t27 & 3099033 & 0 & 2177727 & 75.0 & H & 12501 & 490.0 & $\\dots$ \\\\\n", + "\t28 & 3039596 & 0 & 1226005 & 50.0 & H & 14084 & 257.0 & $\\dots$ \\\\\n", + "\t29 & 3057383 & 0 & 1570940 & 25.0 & H & 15377 & 555.0 & $\\dots$ \\\\\n", + "\t30 & 3044025 & 0 & 1311408 & 25.0 & H & 15497 & 490.0 & $\\dots$ \\\\\n", + "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & \\\\\n", + "\\end{tabular}\n", + "\n", + "$\\dots$\n", + "\n", + "Last Group (1 row): isFraud = 1, ProductCD = \"S\", card4 = \"visa\", card6 = \"credit\", id\\_15 = \"New\", id\\_31 = \"mobile safari 11.0\"\n", + "\n", + "\\begin{tabular}{r|cccccccc}\n", + "\t& TransactionID & isFraud & TransactionDT & TransactionAmt & ProductCD & card1 & card2 & \\\\\n", + "\t\\hline\n", + "\t& Int64 & Int64 & Int64 & Float64 & String & Int64 & Float64? & \\\\\n", + "\t\\hline\n", + "\t1 & 3216693 & 1 & 5439563 & 25.0 & S & 18375 & 174.0 & $\\dots$ \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "GroupedDataFrame with 4553 groups based on keys: isFraud, ProductCD, card4, card6, id_15, id_31\n", + "First Group (136 rows): isFraud = 0, ProductCD = \"H\", card4 = \"visa\", card6 = \"debit\", id_15 = \"Found\", id_31 = \"firefox 57.0\". 
Omitted printing of 429 columns\n", + "│ Row │ TransactionID │ isFraud │ TransactionDT │ TransactionAmt │ ProductCD │\n", + "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n", + "├─────┼───────────────┼─────────┼───────────────┼────────────────┼───────────┤\n", + "│ 1 │ 3067453 │ 0 │ 1729328 │ 200.0 │ H │\n", + "│ 2 │ 3073296 │ 0 │ 1816710 │ 100.0 │ H │\n", + "│ 3 │ 3061872 │ 0 │ 1642116 │ 100.0 │ H │\n", + "│ 4 │ 3078930 │ 0 │ 1902540 │ 100.0 │ H │\n", + "│ 5 │ 3038788 │ 0 │ 1211592 │ 75.0 │ H │\n", + "│ 6 │ 3056336 │ 0 │ 1556463 │ 25.0 │ H │\n", + "│ 7 │ 3026050 │ 0 │ 951882 │ 50.0 │ H │\n", + "│ 8 │ 2999258 │ 0 │ 348274 │ 50.0 │ H │\n", + "│ 9 │ 2995902 │ 0 │ 260116 │ 40.0 │ H │\n", + "│ 10 │ 3124444 │ 0 │ 2767769 │ 50.0 │ H │\n", + "⋮\n", + "│ 126 │ 3002736 │ 0 │ 426917 │ 25.0 │ H │\n", + "│ 127 │ 3066817 │ 0 │ 1723040 │ 50.0 │ H │\n", + "│ 128 │ 3096864 │ 0 │ 2154200 │ 25.0 │ H │\n", + "│ 129 │ 3097062 │ 0 │ 2156024 │ 50.0 │ H │\n", + "│ 130 │ 2998791 │ 0 │ 341460 │ 75.0 │ H │\n", + "│ 131 │ 3090745 │ 0 │ 2067079 │ 30.0 │ H │\n", + "│ 132 │ 3022302 │ 0 │ 861372 │ 50.0 │ H │\n", + "│ 133 │ 3056137 │ 0 │ 1553999 │ 125.0 │ H │\n", + "│ 134 │ 3085711 │ 0 │ 1988068 │ 30.0 │ H │\n", + "│ 135 │ 3099225 │ 0 │ 2180952 │ 50.0 │ H │\n", + "│ 136 │ 3051141 │ 0 │ 1453322 │ 75.0 │ H │\n", + "⋮\n", + "Last Group (1 row): isFraud = 1, ProductCD = \"S\", card4 = \"visa\", card6 = \"credit\", id_15 = \"New\", id_31 = \"mobile safari 11.0\". 
Omitted printing of 429 columns\n", + "│ Row │ TransactionID │ isFraud │ TransactionDT │ TransactionAmt │ ProductCD │\n", + "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n", + "├─────┼───────────────┼─────────┼───────────────┼────────────────┼───────────┤\n", + "│ 1 │ 3216693 │ 1 │ 5439563 │ 25.0 │ S │" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

4,553 rows × 8 columns

isFraudProductCDcard4card6id_15id_31TransactionAmountMaxTransactionAmountMean
Int64StringString?String?String?String?Float64Float64
10HvisadebitFoundfirefox 57.0300.071.1765
20RvisacreditFoundie 11.0 for desktop1000.0208.58
31RvisadebitFoundchrome 63.0 for android300.0216.667
40CvisacreditFoundchrome 65.0410.37349.3293
50RvisadebitFoundchrome 62.0 for android200.098.9583
61CmastercardcreditNewchrome 63.0225.50447.2223
70CmastercardcreditFoundchrome 62.0 for android154.07144.5909
80CmastercardcreditFoundchrome 63.0302.11150.8962
91CmastercardcreditFoundchrome 63.0265.49844.8802
100CmastercardcreditNewchrome 63.0302.11148.2015
110CmastercardcreditFoundsafari generic141.15839.4613
120CmastercardcreditUnknownsafari generic221.5451.347
130CmastercardcreditFoundmobile safari generic162.67653.6838
140RvisacreditNewchrome 64.0350.0138.889
150RvisacreditFoundchrome 64.0450.0153.112
160RvisacreditFoundchrome 65.0900.0177.778
170CmastercardcreditFoundchrome 64.0220.17150.7401
181CmastercardcreditFoundchrome 65.0 for android185.6753.1863
190CvisadebitFoundchrome 63.0422.55843.4618
200CvisadebitFoundchrome 64.0 for android268.26528.9167
210CvisadebitFoundfirefox 57.0204.63344.2996
220CvisadebitFoundchrome 65.0 for android230.07932.405
230CvisadebitFoundchrome 60.0 for android96.3731.5289
240CvisacreditFoundchrome 62.0412.76448.7786
250CmastercarddebitFoundchrome 64.0 for android162.95326.765
260CmastercarddebitFoundchrome 65.0 for android283.3732.5836
270RvisacreditFoundmobile safari generic1000.0167.918
280HvisacreditFoundchrome 62.0 for ios50.040.0
290HvisacreditFoundchrome 64.0450.096.2881
300SvisacreditNewchrome generic200.047.7391
" + ], + "text/latex": [ + "\\begin{tabular}{r|cccccccc}\n", + "\t& isFraud & ProductCD & card4 & card6 & id\\_15 & id\\_31 & TransactionAmountMax & TransactionAmountMean\\\\\n", + "\t\\hline\n", + "\t& Int64 & String & String? & String? & String? & String? & Float64 & Float64\\\\\n", + "\t\\hline\n", + "\t1 & 0 & H & visa & debit & Found & firefox 57.0 & 300.0 & 71.1765 \\\\\n", + "\t2 & 0 & R & visa & credit & Found & ie 11.0 for desktop & 1000.0 & 208.58 \\\\\n", + "\t3 & 1 & R & visa & debit & Found & chrome 63.0 for android & 300.0 & 216.667 \\\\\n", + "\t4 & 0 & C & visa & credit & Found & chrome 65.0 & 410.373 & 49.3293 \\\\\n", + "\t5 & 0 & R & visa & debit & Found & chrome 62.0 for android & 200.0 & 98.9583 \\\\\n", + "\t6 & 1 & C & mastercard & credit & New & chrome 63.0 & 225.504 & 47.2223 \\\\\n", + "\t7 & 0 & C & mastercard & credit & Found & chrome 62.0 for android & 154.071 & 44.5909 \\\\\n", + "\t8 & 0 & C & mastercard & credit & Found & chrome 63.0 & 302.111 & 50.8962 \\\\\n", + "\t9 & 1 & C & mastercard & credit & Found & chrome 63.0 & 265.498 & 44.8802 \\\\\n", + "\t10 & 0 & C & mastercard & credit & New & chrome 63.0 & 302.111 & 48.2015 \\\\\n", + "\t11 & 0 & C & mastercard & credit & Found & safari generic & 141.158 & 39.4613 \\\\\n", + "\t12 & 0 & C & mastercard & credit & Unknown & safari generic & 221.54 & 51.347 \\\\\n", + "\t13 & 0 & C & mastercard & credit & Found & mobile safari generic & 162.676 & 53.6838 \\\\\n", + "\t14 & 0 & R & visa & credit & New & chrome 64.0 & 350.0 & 138.889 \\\\\n", + "\t15 & 0 & R & visa & credit & Found & chrome 64.0 & 450.0 & 153.112 \\\\\n", + "\t16 & 0 & R & visa & credit & Found & chrome 65.0 & 900.0 & 177.778 \\\\\n", + "\t17 & 0 & C & mastercard & credit & Found & chrome 64.0 & 220.171 & 50.7401 \\\\\n", + "\t18 & 1 & C & mastercard & credit & Found & chrome 65.0 for android & 185.67 & 53.1863 \\\\\n", + "\t19 & 0 & C & visa & debit & Found & chrome 63.0 & 422.558 & 43.4618 \\\\\n", + "\t20 & 0 & C & 
visa & debit & Found & chrome 64.0 for android & 268.265 & 28.9167 \\\\\n", + "\t21 & 0 & C & visa & debit & Found & firefox 57.0 & 204.633 & 44.2996 \\\\\n", + "\t22 & 0 & C & visa & debit & Found & chrome 65.0 for android & 230.079 & 32.405 \\\\\n", + "\t23 & 0 & C & visa & debit & Found & chrome 60.0 for android & 96.37 & 31.5289 \\\\\n", + "\t24 & 0 & C & visa & credit & Found & chrome 62.0 & 412.764 & 48.7786 \\\\\n", + "\t25 & 0 & C & mastercard & debit & Found & chrome 64.0 for android & 162.953 & 26.765 \\\\\n", + "\t26 & 0 & C & mastercard & debit & Found & chrome 65.0 for android & 283.37 & 32.5836 \\\\\n", + "\t27 & 0 & R & visa & credit & Found & mobile safari generic & 1000.0 & 167.918 \\\\\n", + "\t28 & 0 & H & visa & credit & Found & chrome 62.0 for ios & 50.0 & 40.0 \\\\\n", + "\t29 & 0 & H & visa & credit & Found & chrome 64.0 & 450.0 & 96.2881 \\\\\n", + "\t30 & 0 & S & visa & credit & New & chrome generic & 200.0 & 47.7391 \\\\\n", + "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "4553×8 DataFrame. 
Omitted printing of 1 columns\n", + "│ Row │ isFraud │ ProductCD │ card4 │ card6 │ id_15 │ id_31 │ TransactionAmountMax │\n", + "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString?\u001b[39m │ \u001b[90mString?\u001b[39m │ \u001b[90mString?\u001b[39m │ \u001b[90mUnion{Missing, String}\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", + "├──────┼─────────┼───────────┼──────────────────┼─────────┼─────────┼─────────────────────────┼──────────────────────┤\n", + "│ 1 │ 0 │ H │ visa │ debit │ Found │ firefox 57.0 │ 300.0 │\n", + "│ 2 │ 0 │ R │ visa │ credit │ Found │ ie 11.0 for desktop │ 1000.0 │\n", + "│ 3 │ 1 │ R │ visa │ debit │ Found │ chrome 63.0 for android │ 300.0 │\n", + "│ 4 │ 0 │ C │ visa │ credit │ Found │ chrome 65.0 │ 410.373 │\n", + "│ 5 │ 0 │ R │ visa │ debit │ Found │ chrome 62.0 for android │ 200.0 │\n", + "│ 6 │ 1 │ C │ mastercard │ credit │ New │ chrome 63.0 │ 225.504 │\n", + "│ 7 │ 0 │ C │ mastercard │ credit │ Found │ chrome 62.0 for android │ 154.071 │\n", + "│ 8 │ 0 │ C │ mastercard │ credit │ Found │ chrome 63.0 │ 302.111 │\n", + "│ 9 │ 1 │ C │ mastercard │ credit │ Found │ chrome 63.0 │ 265.498 │\n", + "│ 10 │ 0 │ C │ mastercard │ credit │ New │ chrome 63.0 │ 302.111 │\n", + "⋮\n", + "│ 4543 │ 0 │ R │ mastercard │ credit │ New │ chrome 60.0 for android │ 100.0 │\n", + "│ 4544 │ 0 │ H │ \u001b[90mmissing\u001b[39m │ debit │ New │ mobile safari 11.0 │ 50.0 │\n", + "│ 4545 │ 0 │ R │ visa │ credit │ Unknown │ chrome 61.0 │ 100.0 │\n", + "│ 4546 │ 0 │ H │ visa │ credit │ New │ edge 17.0 │ 75.0 │\n", + "│ 4547 │ 0 │ H │ visa │ credit │ New │ mobile │ 50.0 │\n", + "│ 4548 │ 0 │ R │ american express │ credit │ New │ opera 49.0 │ 200.0 │\n", + "│ 4549 │ 0 │ R │ visa │ debit │ Found │ opera │ 100.0 │\n", + "│ 4550 │ 1 │ C │ mastercard │ credit │ New │ chrome 59.0 │ 205.682 │\n", + "│ 4551 │ 0 │ C │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ New │ samsung browser 6.4 │ 32.707 │\n", + "│ 4552 │ 1 │ R │ visa │ credit 
│ New │ chrome 64.0 for android │ 300.0 │\n", + "│ 4553 │ 1 │ S │ visa │ credit │ New │ mobile safari 11.0 │ 25.0 │" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ENV[\"COLUMNS\"]=120\n", + "# groupby treats `missing` as a regular key value, so rows with missing keys form their own groups\n", + "combine(groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]), \n", + " :TransactionAmt=>maximum=>:TransactionAmountMax, \n", + " :TransactionAmt=>mean=>:TransactionAmountMean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sorting Details" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

144,233 rows × 434 columns (omitted printing of 424 columns)

TransactionIDisFraudTransactionDTTransactionAmtProductCDcard1card2card3card4card5
Int64Int64Int64Float64StringInt64Float64?Float64?String?Float64?
132309240578741923.443C1000555.0185.0mastercard224.0
230207670842821150.0R1004583.0150.0visa226.0
330289730102217330.0H1004583.0150.0visa226.0
4338644401008248450.0H1004583.0150.0visa226.0
530388710121280250.0H1005543.0150.0mastercard117.0
6323468105883179150.0R1006399.0150.0american express146.0
73436647011468973100.0R1006399.0150.0american express146.0
8309568102145214150.0R1006399.0150.0american express146.0
93021401085073023.203C1007555.0135.0mastercard224.0
1032262410565117755.164C1007555.0135.0mastercard224.0
11303943901222657200.0R1009399.0150.0american express146.0
1230699430178601350.0H1009399.0150.0american express146.0
1330548660153900850.0H1009399.0150.0american express146.0
1430240780926293150.0R1009399.0150.0american express146.0
1529996160354228200.0R1009399.0150.0american express146.0
16337827909826142277.932C1010555.0121.0visa226.0
1733192210818398825.0H1011543.0150.0mastercard224.0
183015198070233150.0H1012479.0150.0visa162.0
1930082680562205100.0H1012479.0150.0visa162.0
20352412401415421675.0H1012479.0150.0visa162.0
21309818502166984175.0R1012479.0150.0visa162.0
22356993101557312119.92C1014555.0117.0visa226.0
23356314601530073643.651C1014555.0117.0visa226.0
243455404112076280450.0R1015555.0144.0mastercard224.0
25339395101027514123.564C1015555.0144.0mastercard224.0
2630688290175179163.6C1015555.0144.0mastercard224.0
27308008001919379108.205C1015555.0144.0mastercard224.0
2830936830212873131.132C1015555.0144.0mastercard224.0
2931598570373466622.037C1015555.0144.0mastercard224.0
3033316290849873922.96C1015555.0144.0mastercard224.0
" + ], + "text/latex": [ + "\\begin{tabular}{r|ccccccccccc}\n", + "\t& TransactionID & isFraud & TransactionDT & TransactionAmt & ProductCD & card1 & card2 & card3 & card4 & card5 & \\\\\n", + "\t\\hline\n", + "\t& Int64 & Int64 & Int64 & Float64 & String & Int64 & Float64? & Float64? & String? & Float64? & \\\\\n", + "\t\\hline\n", + "\t1 & 3230924 & 0 & 5787419 & 23.443 & C & 1000 & 555.0 & 185.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t2 & 3020767 & 0 & 842821 & 150.0 & R & 1004 & 583.0 & 150.0 & visa & 226.0 & $\\dots$ \\\\\n", + "\t3 & 3028973 & 0 & 1022173 & 30.0 & H & 1004 & 583.0 & 150.0 & visa & 226.0 & $\\dots$ \\\\\n", + "\t4 & 3386444 & 0 & 10082484 & 50.0 & H & 1004 & 583.0 & 150.0 & visa & 226.0 & $\\dots$ \\\\\n", + "\t5 & 3038871 & 0 & 1212802 & 50.0 & H & 1005 & 543.0 & 150.0 & mastercard & 117.0 & $\\dots$ \\\\\n", + "\t6 & 3234681 & 0 & 5883179 & 150.0 & R & 1006 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t7 & 3436647 & 0 & 11468973 & 100.0 & R & 1006 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t8 & 3095681 & 0 & 2145214 & 150.0 & R & 1006 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t9 & 3021401 & 0 & 850730 & 23.203 & C & 1007 & 555.0 & 135.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t10 & 3226241 & 0 & 5651177 & 55.164 & C & 1007 & 555.0 & 135.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t11 & 3039439 & 0 & 1222657 & 200.0 & R & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t12 & 3069943 & 0 & 1786013 & 50.0 & H & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t13 & 3054866 & 0 & 1539008 & 50.0 & H & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t14 & 3024078 & 0 & 926293 & 150.0 & R & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t15 & 2999616 & 0 & 354228 & 200.0 & R & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n", + "\t16 & 3378279 & 0 & 
9826142 & 277.932 & C & 1010 & 555.0 & 121.0 & visa & 226.0 & $\\dots$ \\\\\n", + "\t17 & 3319221 & 0 & 8183988 & 25.0 & H & 1011 & 543.0 & 150.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t18 & 3015198 & 0 & 702331 & 50.0 & H & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n", + "\t19 & 3008268 & 0 & 562205 & 100.0 & H & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n", + "\t20 & 3524124 & 0 & 14154216 & 75.0 & H & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n", + "\t21 & 3098185 & 0 & 2166984 & 175.0 & R & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n", + "\t22 & 3569931 & 0 & 15573121 & 19.92 & C & 1014 & 555.0 & 117.0 & visa & 226.0 & $\\dots$ \\\\\n", + "\t23 & 3563146 & 0 & 15300736 & 43.651 & C & 1014 & 555.0 & 117.0 & visa & 226.0 & $\\dots$ \\\\\n", + "\t24 & 3455404 & 1 & 12076280 & 450.0 & R & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t25 & 3393951 & 0 & 10275141 & 23.564 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t26 & 3068829 & 0 & 1751791 & 63.6 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t27 & 3080080 & 0 & 1919379 & 108.205 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t28 & 3093683 & 0 & 2128731 & 31.132 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t29 & 3159857 & 0 & 3734666 & 22.037 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t30 & 3331629 & 0 & 8498739 & 22.96 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n", + "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & \\\\\n", + "\\end{tabular}\n" + ], + "text/plain": [ + "144233×434 DataFrame. 
Omitted printing of 426 columns\n", + "│ Row │ TransactionID │ isFraud │ TransactionDT │ TransactionAmt │ ProductCD │ card1 │ card2 │ card3 │\n", + "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64?\u001b[39m │ \u001b[90mFloat64?\u001b[39m │\n", + "├────────┼───────────────┼─────────┼───────────────┼────────────────┼───────────┼───────┼──────────┼──────────┤\n", + "│ 1 │ 3230924 │ 0 │ 5787419 │ 23.443 │ C │ 1000 │ 555.0 │ 185.0 │\n", + "│ 2 │ 3020767 │ 0 │ 842821 │ 150.0 │ R │ 1004 │ 583.0 │ 150.0 │\n", + "│ 3 │ 3028973 │ 0 │ 1022173 │ 30.0 │ H │ 1004 │ 583.0 │ 150.0 │\n", + "│ 4 │ 3386444 │ 0 │ 10082484 │ 50.0 │ H │ 1004 │ 583.0 │ 150.0 │\n", + "│ 5 │ 3038871 │ 0 │ 1212802 │ 50.0 │ H │ 1005 │ 543.0 │ 150.0 │\n", + "│ 6 │ 3234681 │ 0 │ 5883179 │ 150.0 │ R │ 1006 │ 399.0 │ 150.0 │\n", + "│ 7 │ 3436647 │ 0 │ 11468973 │ 100.0 │ R │ 1006 │ 399.0 │ 150.0 │\n", + "│ 8 │ 3095681 │ 0 │ 2145214 │ 150.0 │ R │ 1006 │ 399.0 │ 150.0 │\n", + "│ 9 │ 3021401 │ 0 │ 850730 │ 23.203 │ C │ 1007 │ 555.0 │ 135.0 │\n", + "│ 10 │ 3226241 │ 0 │ 5651177 │ 55.164 │ C │ 1007 │ 555.0 │ 135.0 │\n", + "⋮\n", + "│ 144223 │ 3449153 │ 0 │ 11892549 │ 4.795 │ C │ 18377 │ 555.0 │ 131.0 │\n", + "│ 144224 │ 3453522 │ 0 │ 12008182 │ 24.715 │ C │ 18377 │ 555.0 │ 131.0 │\n", + "│ 144225 │ 3464285 │ 0 │ 12352490 │ 3.873 │ C │ 18377 │ 555.0 │ 131.0 │\n", + "│ 144226 │ 3464986 │ 0 │ 12401894 │ 13.034 │ C │ 18377 │ 555.0 │ 131.0 │\n", + "│ 144227 │ 3469775 │ 0 │ 12544573 │ 45.372 │ C │ 18377 │ 555.0 │ 131.0 │\n", + "│ 144228 │ 3472353 │ 0 │ 12628755 │ 67.383 │ C │ 18377 │ 555.0 │ 131.0 │\n", + "│ 144229 │ 3140430 │ 0 │ 3178997 │ 100.0 │ H │ 18378 │ 111.0 │ 150.0 │\n", + "│ 144230 │ 3077370 │ 0 │ 1887400 │ 150.0 │ R │ 18383 │ 128.0 │ 150.0 │\n", + "│ 144231 │ 3304239 │ 0 │ 7910291 │ 25.0 │ S │ 18384 │ 543.0 │ 150.0 │\n", + "│ 144232 │ 3058006 │ 0 │ 1604565 │ 100.0 │ H │ 
18388 │ 555.0 │ 150.0 │\n", + "│ 144233 │ 3102181 │ 0 │ 2231501 │ 100.0 │ R │ 18396 │ 111.0 │ 150.0 │" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sort!(dff, [\"card1\",\"addr1\",\"D9\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Julia 1.4.1", + "language": "julia", + "name": "julia-1.4" + }, + "language_info": { + "file_extension": ".jl", + "mimetype": "application/julia", + "name": "julia", + "version": "1.4.1" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Julia/CSV/Read_CSV.ipynb b/Julia/CSV/Read_CSV.ipynb index 06e27f8..4b60d4a 100644 --- a/Julia/CSV/Read_CSV.ipynb +++ b/Julia/CSV/Read_CSV.ipynb @@ -5,6 +5,9 @@ "metadata": {}, "source": [ "## Reading a CSV file to a DataFrame in Julia (programing lang)\n", + "\n", + "Article: https://towardsdatascience.com/read-csv-to-data-frame-in-julia-programming-lang-77f3d0081c14\n", + "\n", "Julia often offer several ways how to do the same thing and reading CSV is an example. In all cases, you will need the `CSV` and `DataFrames` package. 
If you don't have them installed, in the Julia REPL run: `import Pkg; Pkg.add(\"CSV\"); Pkg.add(\"DataFrames\")`" ] },