diff --git a/DataFrames/ALL.ipynb b/DataFrames/ALL.ipynb
new file mode 100644
index 0000000..95b169b
--- /dev/null
+++ b/DataFrames/ALL.ipynb
@@ -0,0 +1,1258 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:14:45.083313Z",
+ "start_time": "2020-11-17T23:14:44.818195Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "UserWarning: The Dask Engine for Modin is experimental.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import gc\n",
+ "from time import time, sleep\n",
+ "\n",
+ "import pandas as pd\n",
+ "import dask.dataframe as dd\n",
+ "import modin.pandas as mpd\n",
+ "import vaex\n",
+ "from pyspark.sql import SparkSession\n",
+ "from pyspark.sql.functions import sum, avg\n",
+ "# pandas on ray has moved to Modin\n",
+ "# import ray.dataframe as rpd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:55:27.722264Z",
+ "start_time": "2020-11-17T22:55:27.699399Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# data based on https://www.kaggle.com/c/ieee-fraud-detection/data\n",
+ "folder = \"/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection\"\n",
+ "files = [\"train_transaction.csv\", \"train_identity.csv\"]\n",
+ "paths = [os.path.join(folder, f) for f in files]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:55:27.733967Z",
+ "start_time": "2020-11-17T22:55:27.727006Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "stats = {}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:55:27.750077Z",
+ "start_time": "2020-11-17T22:55:27.737957Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'1.1.4'"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.__version__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:56:19.465362Z",
+ "start_time": "2020-11-17T22:55:27.767439Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "stats[\"pandas\"] = {}\n",
+ "s = stats[\"pandas\"]\n",
+ "\n",
+ "ts = time()\n",
+ "df = pd.read_csv(paths[0])\n",
+ "te = time()\n",
+ "s[\"load_transactions\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "df2 = pd.read_csv(paths[1])\n",
+ "te = time()\n",
+ "s[\"load_identity\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "dff = df.merge(df2, on=\"TransactionID\")\n",
+ "te = time()\n",
+ "s[\"merge\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "grp = dff[\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\",\"TransactionAmt\"].fillna(\"\")\n",
+ ".groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"].agg([\"mean\",\"sum\"])\n",
+ "te = time()\n",
+ "s[\"aggregation\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "dff.sort_values(by=[\"card1\",\"addr1\",\"D9\"], inplace=True)\n",
+ "dff.sort_values(by=[\"addr1\",\"D9\",\"card1\"], inplace=True)\n",
+ "dff.sort_values(by=[\"D9\",\"card1\",\"addr1\"], inplace=True)\n",
+ "te = time()\n",
+ "s[\"sorting\"] = te-ts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:56:19.519391Z",
+ "start_time": "2020-11-17T22:56:19.472412Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pandas | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " aggregation | \n",
+ " 0.075788 | \n",
+ "
\n",
+ " \n",
+ " load_identity | \n",
+ " 0.682109 | \n",
+ "
\n",
+ " \n",
+ " load_transactions | \n",
+ " 18.279765 | \n",
+ "
\n",
+ " \n",
+ " merge | \n",
+ " 3.196074 | \n",
+ "
\n",
+ " \n",
+ " sorting | \n",
+ " 2.224164 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pandas\n",
+ "aggregation 0.075788\n",
+ "load_identity 0.682109\n",
+ "load_transactions 18.279765\n",
+ "merge 3.196074\n",
+ "sorting 2.224164"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(stats)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dff.to_pickle(\"data/dff.pkl\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(4553, 2)"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Because Julia's group-by includes N/A values, let's just check that the number of groups matches\n",
+ "grp = dff[[\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\",\"TransactionAmt\"]].fillna(\"~U~\")\\\n",
+ ".groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"].agg([\"mean\",\"sum\"])\n",
+ "grp.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:56:19.550559Z",
+ "start_time": "2020-11-17T22:56:19.524963Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def clean(wait_time: int=15):\n",
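+ " \"\"\"Set df, df2, dff and grp to None, call the garbage collector, then wait for wait_time seconds (15 by default).\n",
+ "\n",
+ " NOTE(review): the four names are assigned inside this function without a global declaration,\n",
+ " so they are function locals and the notebook-level DataFrames are not released by this call.\n",
+ " \"\"\"\n",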
+ " df, df2, dff, grp = None, None, None, None\n",
+ " gc.collect()\n",
+ " sleep(wait_time)\n",
+ " return None"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:56:34.718818Z",
+ "start_time": "2020-11-17T22:56:19.559830Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "clean()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:56:34.813222Z",
+ "start_time": "2020-11-17T22:56:34.780299Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def list_variables_memory_usage() -> dict:\n",
+ " \"\"\"Return a dict mapping each name in this function's own locals() to its sys.getsizeof() size.\"\"\"\n",
+ " local_vars = list(locals().items())\n",
+ " return {var: sys.getsizeof(obj) for var, obj in local_vars}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Dask\n",
+ "When to use dask - https://docs.dask.org/en/latest/dataframe.html#common-uses-and-anti-uses"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:57:00.280745Z",
+ "start_time": "2020-11-17T22:56:34.844985Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "stats[\"dask\"] = {}\n",
+ "s = stats[\"dask\"]\n",
+ "\n",
+ "ts = time()\n",
+ "df = dd.read_csv(paths[0])\n",
+ "te = time()\n",
+ "s[\"load_transactions\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "df2 = dd.read_csv(paths[1])\n",
+ "te = time()\n",
+ "s[\"load_identity\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "dff = df.merge(df2, on=\"TransactionID\")\n",
+ "te = time()\n",
+ "s[\"merge\"] = te-ts\n",
+ "\n",
+ "# the difference from pandas is that we call the compute method, which runs all the computations at this point\n",
+ "ts = time()\n",
+ "grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"]\\\n",
+ " .agg([\"mean\",\"sum\"])\\\n",
+ " .compute()\n",
+ "te = time()\n",
+ "s[\"aggregation\"] = te-ts\n",
+ "\n",
+ "# parallel sorting is tricky, which is why there are only workarounds in dask.\n",
+ "ts = time()\n",
+ "dff.set_index(\"card1\").compute()\n",
+ "te = time()\n",
+ "s[\"sorting\"] = te-ts\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:57:15.539155Z",
+ "start_time": "2020-11-17T22:57:00.286799Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "clean()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:57:15.580380Z",
+ "start_time": "2020-11-17T22:57:15.546567Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pandas | \n",
+ " dask | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " load_transactions | \n",
+ " 18.279765 | \n",
+ " 0.083901 | \n",
+ "
\n",
+ " \n",
+ " load_identity | \n",
+ " 0.682109 | \n",
+ " 0.028268 | \n",
+ "
\n",
+ " \n",
+ " merge | \n",
+ " 3.196074 | \n",
+ " 0.073891 | \n",
+ "
\n",
+ " \n",
+ " aggregation | \n",
+ " 0.075788 | \n",
+ " 20.837958 | \n",
+ "
\n",
+ " \n",
+ " sorting | \n",
+ " 2.224164 | \n",
+ " 71.282675 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pandas dask\n",
+ "load_transactions 18.279765 0.083901\n",
+ "load_identity 0.682109 0.028268\n",
+ "merge 3.196074 0.073891\n",
+ "aggregation 0.075788 20.837958\n",
+ "sorting 2.224164 71.282675"
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(stats)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stats[\"dask_indexed\"] = {}\n",
+ "s = stats[\"dask_indexed\"]\n",
+ "\n",
+ "ts = time()\n",
+ "df = dd.read_csv(paths[0]).set_index(\"TransactionID\")\n",
+ "te = time()\n",
+ "s[\"load_transactions\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "df2 = dd.read_csv(paths[1]).set_index(\"TransactionID\")\n",
+ "te = time()\n",
+ "s[\"load_identity\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "dff = df.merge(df2, left_index=True, right_index=True)\n",
+ "te = time()\n",
+ "s[\"merge\"] = te-ts\n",
+ "\n",
+ "# the difference from pandas is that we call the compute method, which runs all the computations at this point\n",
+ "ts = time()\n",
+ "grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"]\\\n",
+ " .agg([\"mean\",\"sum\"])\\\n",
+ " .compute()\n",
+ "te = time()\n",
+ "s[\"aggregation\"] = te-ts\n",
+ "\n",
+ "# parallel sorting is tricky, which is why there are only workarounds in dask.\n",
+ "ts = time()\n",
+ "dff.set_index(\"card1\").compute()\n",
+ "te = time()\n",
+ "s[\"sorting\"] = te-ts\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pandas | \n",
+ " dask | \n",
+ " dask_indexed | \n",
+ " vaex | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " load_transactions | \n",
+ " 18.279765 | \n",
+ " 0.083901 | \n",
+ " 14.930128 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " load_identity | \n",
+ " 0.682109 | \n",
+ " 0.028268 | \n",
+ " 0.761821 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " merge | \n",
+ " 3.196074 | \n",
+ " 0.073891 | \n",
+ " 0.078762 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " aggregation | \n",
+ " 0.075788 | \n",
+ " 20.837958 | \n",
+ " 23.130105 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " sorting | \n",
+ " 2.224164 | \n",
+ " 71.282675 | \n",
+ " 75.393628 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pandas dask dask_indexed vaex\n",
+ "load_transactions 18.279765 0.083901 14.930128 NaN\n",
+ "load_identity 0.682109 0.028268 0.761821 NaN\n",
+ "merge 3.196074 0.073891 0.078762 NaN\n",
+ "aggregation 0.075788 20.837958 23.130105 NaN\n",
+ "sorting 2.224164 71.282675 75.393628 NaN"
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "clean()\n",
+ "pd.DataFrame(stats)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:29:19.438715Z",
+ "start_time": "2020-11-17T22:29:19.429209Z"
+ }
+ },
+ "source": [
+ "# Vaex"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:03:06.560013Z",
+ "start_time": "2020-11-17T23:03:06.545427Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'vaex-core': '2.0.3',\n",
+ " 'vaex-viz': '0.4.0',\n",
+ " 'vaex-hdf5': '0.6.0',\n",
+ " 'vaex-server': '0.3.1',\n",
+ " 'vaex-astro': '0.7.0',\n",
+ " 'vaex-jupyter': '0.5.2',\n",
+ " 'vaex-ml': '0.9.0',\n",
+ " 'vaex-arrow': '0.5.1'}"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "vaex.__version__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:04:01.165275Z",
+ "start_time": "2020-11-17T23:03:06.562006Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "tool = \"vaex\"\n",
+ "stats[tool] = {}\n",
+ "s = stats[tool]\n",
+ "\n",
+ "\n",
+ "ts = time()\n",
+ "df = vaex.open(paths[0])\n",
+ "te = time()\n",
+ "s[\"load_transactions\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "df2 = vaex.open(paths[1])\n",
+ "te = time()\n",
+ "s[\"load_identity\"] = te-ts\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:04:01.642707Z",
+ "start_time": "2020-11-17T23:04:01.176085Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ts = time()\n",
+ "dff = df.join(df2, on=\"TransactionID\")\n",
+ "te = time()\n",
+ "s[\"merge\"] = te-ts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:04:03.395316Z",
+ "start_time": "2020-11-17T23:04:01.645742Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# vaex takes the aggregations through the agg argument of groupby; there is no separate compute call here\n",
+ "ts = time()\n",
+ "grp = dff.groupby([dff[\"isFraud\"],dff[\"ProductCD\"],dff[\"card4\"],dff[\"card6\"],dff[\"id_15\"],dff[\"id_31\"]], \n",
+ " agg=[vaex.agg.mean('TransactionAmt'), vaex.agg.sum('TransactionAmt')])\n",
+ "te = time()\n",
+ "s[\"aggregation\"] = te-ts\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 111,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# sort by the same three column orderings as the pandas benchmark; there is no compute call here\n",
+ "ts = time()\n",
+ "dff_s = dff.sort(by=[\"card1\",\"addr1\",\"D9\"])\n",
+ "dff_s = dff.sort(by=[\"addr1\",\"D9\",\"card1\"])\n",
+ "dff_s = dff.sort(by=[\"D9\",\"card1\",\"addr1\"])\n",
+ "te = time()\n",
+ "s[\"sorting\"] = te-ts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 113,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:04:03.469428Z",
+ "start_time": "2020-11-17T23:04:03.423857Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pandas | \n",
+ " dask | \n",
+ " dask_indexed | \n",
+ " vaex | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " load_transactions | \n",
+ " 18.279765 | \n",
+ " 0.083901 | \n",
+ " 14.930128 | \n",
+ " 18.734002 | \n",
+ "
\n",
+ " \n",
+ " load_identity | \n",
+ " 0.682109 | \n",
+ " 0.028268 | \n",
+ " 0.761821 | \n",
+ " 1.023915 | \n",
+ "
\n",
+ " \n",
+ " merge | \n",
+ " 3.196074 | \n",
+ " 0.073891 | \n",
+ " 0.078762 | \n",
+ " 0.131490 | \n",
+ "
\n",
+ " \n",
+ " aggregation | \n",
+ " 0.075788 | \n",
+ " 20.837958 | \n",
+ " 23.130105 | \n",
+ " 0.383996 | \n",
+ "
\n",
+ " \n",
+ " sorting | \n",
+ " 2.224164 | \n",
+ " 71.282675 | \n",
+ " 75.393628 | \n",
+ " 1.035000 | \n",
+ "
\n",
+ " \n",
+ " sort | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.329828 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pandas dask dask_indexed vaex\n",
+ "load_transactions 18.279765 0.083901 14.930128 18.734002\n",
+ "load_identity 0.682109 0.028268 0.761821 1.023915\n",
+ "merge 3.196074 0.073891 0.078762 0.131490\n",
+ "aggregation 0.075788 20.837958 23.130105 0.383996\n",
+ "sorting 2.224164 71.282675 75.393628 1.035000\n",
+ "sort NaN NaN NaN 0.329828"
+ ]
+ },
+ "execution_count": 113,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.DataFrame(stats)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:04:18.930053Z",
+ "start_time": "2020-11-17T23:04:03.543914Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "clean()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# PySpark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:04:27.010489Z",
+ "start_time": "2020-11-17T23:04:18.932048Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from pyspark import SparkContext\n",
+ "sc = SparkContext()\n",
+ "sc.version\n",
+ "sc.stop()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:04:29.416261Z",
+ "start_time": "2020-11-17T23:04:27.011485Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Create my_spark\n",
+ "my_spark = SparkSession.builder \\\n",
+ " .master(\"local\") \\\n",
+ " .appName(\"Pandas Alternative\") \\\n",
+ " .config(\"spark.some.config.option\", \"some-value\") \\\n",
+ " .getOrCreate()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:05:33.528531Z",
+ "start_time": "2020-11-17T23:04:29.419253Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "tool = \"spark\"\n",
+ "stats[tool] = {}\n",
+ "s = stats[tool]\n",
+ "\n",
+ "\n",
+ "ts = time()\n",
+ "df = my_spark.read.csv(paths[0],inferSchema = True,header= True) \n",
+ "te = time()\n",
+ "s[\"load_transactions\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "df2 = my_spark.read.csv(paths[1],inferSchema = True,header= True) \n",
+ "te = time()\n",
+ "s[\"load_identity\"] = te-ts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:05:33.849489Z",
+ "start_time": "2020-11-17T23:05:33.534687Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "ts = time()\n",
+ "dff = df.join(df2, \"TransactionID\")\n",
+ "te = time()\n",
+ "s[\"merge\"] = te-ts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:06:01.148952Z",
+ "start_time": "2020-11-17T23:05:33.851490Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# the difference is that we call collect method, which runs all the computations at this point\n",
+ "#ts = time()\n",
+ "#grp = dff.groupby([dff[\"isFraud\"],dff[\"ProductCD\"],dff[\"card4\"],dff[\"card6\"],dff[\"id_15\"],dff[\"id_31\"]]) \\\n",
+ "# .agg(avg(\"TransactionAmt\"), sum(\"TransactionAmt\"))\\\n",
+ "# .collect()\n",
+ "#te = time()\n",
+ "#s[\"aggregation\"] = te-ts\n",
+ "#s[\"all\"] = te-tss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:13:24.332254Z",
+ "start_time": "2020-11-17T23:13:03.641149Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# the difference from pandas is that we call the collect method, which runs all the computations at this point\n",
+ "ts = time()\n",
+ "grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]) \\\n",
+ " .agg(avg(\"TransactionAmt\"), sum(\"TransactionAmt\"))\\\n",
+ " .collect()\n",
+ "te = time()\n",
+ "s[\"aggregation\"] = te-ts\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:06:25.157340Z",
+ "start_time": "2020-11-17T23:06:25.118349Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pandas | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " aggregation | \n",
+ " 0.060114 | \n",
+ "
\n",
+ " \n",
+ " all | \n",
+ " 19.908346 | \n",
+ "
\n",
+ " \n",
+ " load_identity | \n",
+ " 0.480164 | \n",
+ "
\n",
+ " \n",
+ " load_transactions | \n",
+ " 17.354527 | \n",
+ "
\n",
+ " \n",
+ " merge | \n",
+ " 2.013150 | \n",
+ "
\n",
+ " \n",
+ " Total | \n",
+ " 39.816302 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pandas\n",
+ "aggregation 0.060114\n",
+ "all 19.908346\n",
+ "load_identity 0.480164\n",
+ "load_transactions 17.354527\n",
+ "merge 2.013150\n",
+ "Total 39.816302"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "stats_df = pd.DataFrame(stats)\n",
+ "stats_df.loc['Total'] = stats_df.sum()\n",
+ "stats_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:16:19.569545Z",
+ "start_time": "2020-11-17T22:16:19.559625Z"
+ }
+ },
+ "source": [
+ "# Modin"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:57:15.609009Z",
+ "start_time": "2020-11-17T22:57:15.586070Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'0.8.2'"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mpd.__version__"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:00:06.292260Z",
+ "start_time": "2020-11-17T22:58:42.702035Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n",
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n",
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n",
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n",
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n",
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n"
+ ]
+ },
+ {
+ "ename": "KilledWorker",
+ "evalue": "('lambda-dc847cac2df298f0ded2b3e426e3824d', )",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[1;31mKilledWorker\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[0mts\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpaths\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[0mte\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[0ms\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"load_transactions\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mte\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mts\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\pandas\\io.py\u001b[0m in \u001b[0;36mparser_func\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 107\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"sep\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"sep\"\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"\\t\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 109\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 110\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mparser_func\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\pandas\\io.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(**kwargs)\u001b[0m\n\u001b[0;32m 125\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mmodin\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata_management\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfactories\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdispatcher\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mEngineDispatcher\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 126\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 127\u001b[1;33m \u001b[0mpd_obj\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mEngineDispatcher\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 128\u001b[0m \u001b[1;31m# This happens when `read_csv` returns a TextFileReader object for iterating through\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpd_obj\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparsers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTextFileReader\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\data_management\\factories\\dispatcher.py\u001b[0m in \u001b[0;36mread_csv\u001b[1;34m(cls, **kwargs)\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 103\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 104\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 105\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\data_management\\factories\\factories.py\u001b[0m in \u001b[0;36m_read_csv\u001b[1;34m(cls, **kwargs)\u001b[0m\n\u001b[0;32m 85\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 86\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_read_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 87\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mio_cls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 88\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 89\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\engines\\base\\io\\file_reader.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(cls, *args, **kwargs)\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 29\u001b[1;33m \u001b[0mquery_compiler\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 30\u001b[0m \u001b[1;31m# TODO (devin-petersohn): Make this section more general for non-pandas kernel\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[1;31m# implementations.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\engines\\base\\io\\text\\csv_reader.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(cls, filepath_or_buffer, **kwargs)\u001b[0m\n\u001b[0;32m 174\u001b[0m \u001b[1;31m# or based on the column(s) that were requested.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 175\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mindex_col\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 176\u001b[1;33m \u001b[0mrow_lengths\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmaterialize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex_ids\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 177\u001b[0m \u001b[0mnew_index\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpandas\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mRangeIndex\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrow_lengths\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 178\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\modin\\engines\\dask\\task_wrapper.py\u001b[0m in \u001b[0;36mmaterialize\u001b[1;34m(cls, future)\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mmaterialize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[0mclient\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_get_global_client\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 30\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mclient\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgather\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36mgather\u001b[1;34m(self, futures, errors, direct, asynchronous)\u001b[0m\n\u001b[0;32m 1990\u001b[0m \u001b[0mdirect\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdirect\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1991\u001b[0m \u001b[0mlocal_worker\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mlocal_worker\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1992\u001b[1;33m \u001b[0masynchronous\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0masynchronous\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1993\u001b[0m )\n\u001b[0;32m 1994\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36msync\u001b[1;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[0;32m 831\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 832\u001b[0m return sync(\n\u001b[1;32m--> 833\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 834\u001b[0m )\n\u001b[0;32m 835\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\utils.py\u001b[0m in \u001b[0;36msync\u001b[1;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[0;32m 338\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 339\u001b[0m \u001b[0mtyp\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 340\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 341\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 342\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\utils.py\u001b[0m in \u001b[0;36mf\u001b[1;34m()\u001b[0m\n\u001b[0;32m 322\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mcallback_timeout\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 323\u001b[0m \u001b[0mfuture\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwait_for\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 324\u001b[1;33m \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32myield\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 325\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 326\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\tornado\\gen.py\u001b[0m in \u001b[0;36mrun\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 733\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 734\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 735\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 736\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 737\u001b[0m \u001b[0mexc_info\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;32m~\\Anaconda3\\envs\\big_tables\\lib\\site-packages\\distributed\\client.py\u001b[0m in \u001b[0;36m_gather\u001b[1;34m(self, futures, errors, direct, local_worker)\u001b[0m\n\u001b[0;32m 1849\u001b[0m \u001b[0mexc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCancelledError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1850\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1851\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mexception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraceback\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1852\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1853\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"skip\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+ "\u001b[1;31mKilledWorker\u001b[0m: ('lambda-dc847cac2df298f0ded2b3e426e3824d', )"
+ ]
+ }
+ ],
+ "source": [
+ "tool = \"modin\"\n",
+ "stats[tool] = {}\n",
+ "s = stats[tool]\n",
+ "\n",
+ "\n",
+ "ts = time()\n",
+ "df = mpd.read_csv(paths[0])\n",
+ "te = time()\n",
+ "s[\"load_transactions\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "df2 = mpd.read_csv(paths[1])\n",
+ "te = time()\n",
+ "s[\"load_identity\"] = te-ts\n",
+ "\n",
+ "ts = time()\n",
+ "dff = df.merge(df2, on=\"TransactionID\")\n",
+ "te = time()\n",
+ "s[\"merge\"] = te-ts\n",
+ "\n",
+ "# modin defaults to pandas for multi-column aggregation and then fails with a KeyError, even though the key is available\n",
+ "ts = time()\n",
+ "try:\n",
+ " grp = dff.groupby([\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])[\"TransactionAmt\"].agg([\"mean\",\"sum\"])\n",
+ "except Exception as e:\n",
+ " print(e)\n",
+ "te = time()\n",
+ "s[\"aggregation\"] = te-ts\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T22:58:20.241434Z",
+ "start_time": "2020-11-17T22:55:17.025Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "pd.DataFrame(stats)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-17T23:01:01.211911Z",
+ "start_time": "2020-11-17T23:00:45.850513Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n",
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n",
+ "distributed.nanny - WARNING - Worker exceeded 95% memory budget. Restarting\n",
+ "distributed.nanny - WARNING - Restarting worker\n"
+ ]
+ }
+ ],
+ "source": [
+ "clean()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "big-tables",
+ "language": "python",
+ "name": "big-tables"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.5"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/DataFrames/Julia.ipynb b/DataFrames/Julia.ipynb
new file mode 100644
index 0000000..7d7eeb8
--- /dev/null
+++ b/DataFrames/Julia.ipynb
@@ -0,0 +1,593 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-15T23:25:27.615000+01:00",
+ "start_time": "2020-11-15T22:25:00.718Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "using CSV\n",
+ "using DataFrames\n",
+ "using Dates\n",
+ "using Statistics"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "v\"1.4.1\""
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Julia version\n",
+ "VERSION"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2020-11-15T23:25:31.536000+01:00",
+ "start_time": "2020-11-15T22:25:00.722Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2-element Array{String,1}:\n",
+ " \"dff.pkl\"\n",
+ " \"sales_data_sample.csv\""
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "readdir(\"data\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2-element Array{String,1}:\n",
+ " \"train_transaction.csv\"\n",
+ " \"train_identity.csv\""
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "folder = \"/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection\"\n",
+ "files = [\"train_transaction.csv\", \"train_identity.csv\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"/home/vaclav/Data/Kaggle/EEE-CIS_Fraud_Detection/train_transaction.csv\""
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "joinpath(folder,files[1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Dict{Any,Any} with 5 entries:\n",
+ " \"merge\" => 0.771\n",
+ " \"sort\" => 5.032\n",
+ " \"load_transactions\" => 8.045\n",
+ " \"aggregation\" => 0.034\n",
+ " \"load_identity\" => 0.502"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "s = Dict()\n",
+ "\n",
+ "# load transactions ~600MB\n",
+ "ts = now()\n",
+ "df = CSV.read(joinpath(folder,files[1]), DataFrame)\n",
+ "te = now()\n",
+ "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
+ "push!(s, \"load_transactions\"=>time_in_sec)\n",
+ "\n",
+ "# load identity ~25MB\n",
+ "ts = now()\n",
+ "df2 = CSV.read(joinpath(folder,files[2]), DataFrame)\n",
+ "te = now()\n",
+ "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
+ "push!(s, \"load_identity\"=>time_in_sec)\n",
+ "\n",
+ "# join\n",
+ "ts = now()\n",
+ "dff = join(df, df2, kind = :inner, on = \"TransactionID\")\n",
+ "te = now()\n",
+ "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
+ "push!(s, \"merge\"=>time_in_sec)\n",
+ "\n",
+ "# group by\n",
+ "ts = now()\n",
+ "grp = combine(groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]), \n",
+ " :TransactionAmt=>maximum=>:TransactionAmountMax, \n",
+ " :TransactionAmt=>mean=>:TransactionAmountMean)\n",
+ "te = now()\n",
+ "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
+ "push!(s, \"aggregation\"=>time_in_sec)\n",
+ "\n",
+ "# sort (three times, each with a different column order)\n",
+ "ts = now()\n",
+ "sort!(dff, [\"card1\",\"addr1\",\"D9\"])\n",
+ "sort!(dff, [\"addr1\",\"D9\",\"card1\"])\n",
+ "sort!(dff, [\"D9\",\"card1\",\"addr1\"])\n",
+ "te = now()\n",
+ "time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
+ "push!(s, \"sort\"=>time_in_sec)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " | aggregation | load_identity | load_transactions | merge | sort |
---|
| Float64 | Float64 | Float64 | Float64 | Float64 |
---|
1 rows × 5 columns
1 | 0.034 | 0.502 | 8.045 | 0.771 | 5.032 |
---|
"
+ ],
+ "text/latex": [
+ "\\begin{tabular}{r|ccccc}\n",
+ "\t& aggregation & load\\_identity & load\\_transactions & merge & sort\\\\\n",
+ "\t\\hline\n",
+ "\t& Float64 & Float64 & Float64 & Float64 & Float64\\\\\n",
+ "\t\\hline\n",
+ "\t1 & 0.034 & 0.502 & 8.045 & 0.771 & 5.032 \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/plain": [
+ "1×5 DataFrame\n",
+ "│ Row │ aggregation │ load_identity │ load_transactions │ merge │ sort │\n",
+ "│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n",
+ "├─────┼─────────────┼───────────────┼───────────────────┼─────────┼─────────┤\n",
+ "│ 1 │ 0.034 │ 0.502 │ 8.045 │ 0.771 │ 5.032 │"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "DataFrame(s)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(144233, 434, 4553, 8)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# check the shape of the dataframes\n",
+ "nrow(dff), length(names(dff)), nrow(grp), length(names(grp))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Groupby Details\n",
+ "https://dataframes.juliadata.org/stable/man/split_apply_combine/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "GroupedDataFrame with 4553 groups based on keys: isFraud, ProductCD, card4, card6, id_15, id_31
First Group (136 rows): isFraud = 0, ProductCD = \"H\", card4 = \"visa\", card6 = \"debit\", id_15 = \"Found\", id_31 = \"firefox 57.0\"
| TransactionID | isFraud | TransactionDT | TransactionAmt | ProductCD | card1 | card2 |
---|
| Int64 | Int64 | Int64 | Float64 | String | Int64 | Float64? |
---|
1 | 3067453 | 0 | 1729328 | 200.0 | H | 1030 | 157.0 |
---|
2 | 3073296 | 0 | 1816710 | 100.0 | H | 1675 | 174.0 |
---|
3 | 3061872 | 0 | 1642116 | 100.0 | H | 1974 | 111.0 |
---|
4 | 3078930 | 0 | 1902540 | 100.0 | H | 6697 | 111.0 |
---|
5 | 3038788 | 0 | 1211592 | 75.0 | H | 7508 | 321.0 |
---|
6 | 3056336 | 0 | 1556463 | 25.0 | H | 9500 | 321.0 |
---|
7 | 3026050 | 0 | 951882 | 50.0 | H | 9500 | 321.0 |
---|
8 | 2999258 | 0 | 348274 | 50.0 | H | 10680 | 373.0 |
---|
9 | 2995902 | 0 | 260116 | 40.0 | H | 12526 | 381.0 |
---|
10 | 3124444 | 0 | 2767769 | 50.0 | H | 12839 | 321.0 |
---|
11 | 3091878 | 0 | 2077478 | 40.0 | H | 2884 | 490.0 |
---|
12 | 3011076 | 0 | 608878 | 50.0 | H | 7508 | 321.0 |
---|
13 | 3170944 | 0 | 4067304 | 100.0 | H | 7664 | 490.0 |
---|
14 | 3170950 | 0 | 4067403 | 100.0 | H | 7664 | 490.0 |
---|
15 | 3170951 | 0 | 4067438 | 100.0 | H | 7664 | 490.0 |
---|
16 | 3145133 | 0 | 3289750 | 150.0 | H | 9112 | 250.0 |
---|
17 | 3034874 | 0 | 1130093 | 300.0 | H | 10294 | 555.0 |
---|
18 | 3170845 | 0 | 4065306 | 35.0 | H | 12695 | 490.0 |
---|
19 | 3114169 | 0 | 2516387 | 30.0 | H | 5822 | 555.0 |
---|
20 | 3264648 | 0 | 6746645 | 40.0 | H | 15497 | 490.0 |
---|
21 | 3043704 | 0 | 1304889 | 25.0 | H | 16993 | 555.0 |
---|
22 | 3191704 | 0 | 4678752 | 25.0 | H | 1323 | 268.0 |
---|
23 | 3092620 | 0 | 2087380 | 100.0 | H | 2772 | 512.0 |
---|
24 | 3023170 | 0 | 876581 | 50.0 | H | 17188 | 321.0 |
---|
25 | 3062921 | 0 | 1654005 | 50.0 | H | 17496 | 554.0 |
---|
26 | 3098962 | 0 | 2176776 | 75.0 | H | 12501 | 490.0 |
---|
27 | 3099033 | 0 | 2177727 | 75.0 | H | 12501 | 490.0 |
---|
28 | 3039596 | 0 | 1226005 | 50.0 | H | 14084 | 257.0 |
---|
29 | 3057383 | 0 | 1570940 | 25.0 | H | 15377 | 555.0 |
---|
30 | 3044025 | 0 | 1311408 | 25.0 | H | 15497 | 490.0 |
---|
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
---|
⋮
Last Group (1 row): isFraud = 1, ProductCD = \"S\", card4 = \"visa\", card6 = \"credit\", id_15 = \"New\", id_31 = \"mobile safari 11.0\"
| TransactionID | isFraud | TransactionDT | TransactionAmt | ProductCD | card1 | card2 |
---|
| Int64 | Int64 | Int64 | Float64 | String | Int64 | Float64? |
---|
1 | 3216693 | 1 | 5439563 | 25.0 | S | 18375 | 174.0 |
---|
"
+ ],
+ "text/latex": [
+ "GroupedDataFrame with 4553 groups based on keys: isFraud, ProductCD, card4, card6, id\\_15, id\\_31\n",
+ "\n",
+ "First Group (136 rows): isFraud = 0, ProductCD = \"H\", card4 = \"visa\", card6 = \"debit\", id\\_15 = \"Found\", id\\_31 = \"firefox 57.0\"\n",
+ "\n",
+ "\\begin{tabular}{r|cccccccc}\n",
+ "\t& TransactionID & isFraud & TransactionDT & TransactionAmt & ProductCD & card1 & card2 & \\\\\n",
+ "\t\\hline\n",
+ "\t& Int64 & Int64 & Int64 & Float64 & String & Int64 & Float64? & \\\\\n",
+ "\t\\hline\n",
+ "\t1 & 3067453 & 0 & 1729328 & 200.0 & H & 1030 & 157.0 & $\\dots$ \\\\\n",
+ "\t2 & 3073296 & 0 & 1816710 & 100.0 & H & 1675 & 174.0 & $\\dots$ \\\\\n",
+ "\t3 & 3061872 & 0 & 1642116 & 100.0 & H & 1974 & 111.0 & $\\dots$ \\\\\n",
+ "\t4 & 3078930 & 0 & 1902540 & 100.0 & H & 6697 & 111.0 & $\\dots$ \\\\\n",
+ "\t5 & 3038788 & 0 & 1211592 & 75.0 & H & 7508 & 321.0 & $\\dots$ \\\\\n",
+ "\t6 & 3056336 & 0 & 1556463 & 25.0 & H & 9500 & 321.0 & $\\dots$ \\\\\n",
+ "\t7 & 3026050 & 0 & 951882 & 50.0 & H & 9500 & 321.0 & $\\dots$ \\\\\n",
+ "\t8 & 2999258 & 0 & 348274 & 50.0 & H & 10680 & 373.0 & $\\dots$ \\\\\n",
+ "\t9 & 2995902 & 0 & 260116 & 40.0 & H & 12526 & 381.0 & $\\dots$ \\\\\n",
+ "\t10 & 3124444 & 0 & 2767769 & 50.0 & H & 12839 & 321.0 & $\\dots$ \\\\\n",
+ "\t11 & 3091878 & 0 & 2077478 & 40.0 & H & 2884 & 490.0 & $\\dots$ \\\\\n",
+ "\t12 & 3011076 & 0 & 608878 & 50.0 & H & 7508 & 321.0 & $\\dots$ \\\\\n",
+ "\t13 & 3170944 & 0 & 4067304 & 100.0 & H & 7664 & 490.0 & $\\dots$ \\\\\n",
+ "\t14 & 3170950 & 0 & 4067403 & 100.0 & H & 7664 & 490.0 & $\\dots$ \\\\\n",
+ "\t15 & 3170951 & 0 & 4067438 & 100.0 & H & 7664 & 490.0 & $\\dots$ \\\\\n",
+ "\t16 & 3145133 & 0 & 3289750 & 150.0 & H & 9112 & 250.0 & $\\dots$ \\\\\n",
+ "\t17 & 3034874 & 0 & 1130093 & 300.0 & H & 10294 & 555.0 & $\\dots$ \\\\\n",
+ "\t18 & 3170845 & 0 & 4065306 & 35.0 & H & 12695 & 490.0 & $\\dots$ \\\\\n",
+ "\t19 & 3114169 & 0 & 2516387 & 30.0 & H & 5822 & 555.0 & $\\dots$ \\\\\n",
+ "\t20 & 3264648 & 0 & 6746645 & 40.0 & H & 15497 & 490.0 & $\\dots$ \\\\\n",
+ "\t21 & 3043704 & 0 & 1304889 & 25.0 & H & 16993 & 555.0 & $\\dots$ \\\\\n",
+ "\t22 & 3191704 & 0 & 4678752 & 25.0 & H & 1323 & 268.0 & $\\dots$ \\\\\n",
+ "\t23 & 3092620 & 0 & 2087380 & 100.0 & H & 2772 & 512.0 & $\\dots$ \\\\\n",
+ "\t24 & 3023170 & 0 & 876581 & 50.0 & H & 17188 & 321.0 & $\\dots$ \\\\\n",
+ "\t25 & 3062921 & 0 & 1654005 & 50.0 & H & 17496 & 554.0 & $\\dots$ \\\\\n",
+ "\t26 & 3098962 & 0 & 2176776 & 75.0 & H & 12501 & 490.0 & $\\dots$ \\\\\n",
+ "\t27 & 3099033 & 0 & 2177727 & 75.0 & H & 12501 & 490.0 & $\\dots$ \\\\\n",
+ "\t28 & 3039596 & 0 & 1226005 & 50.0 & H & 14084 & 257.0 & $\\dots$ \\\\\n",
+ "\t29 & 3057383 & 0 & 1570940 & 25.0 & H & 15377 & 555.0 & $\\dots$ \\\\\n",
+ "\t30 & 3044025 & 0 & 1311408 & 25.0 & H & 15497 & 490.0 & $\\dots$ \\\\\n",
+ "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & \\\\\n",
+ "\\end{tabular}\n",
+ "\n",
+ "$\\dots$\n",
+ "\n",
+ "Last Group (1 row): isFraud = 1, ProductCD = \"S\", card4 = \"visa\", card6 = \"credit\", id\\_15 = \"New\", id\\_31 = \"mobile safari 11.0\"\n",
+ "\n",
+ "\\begin{tabular}{r|cccccccc}\n",
+ "\t& TransactionID & isFraud & TransactionDT & TransactionAmt & ProductCD & card1 & card2 & \\\\\n",
+ "\t\\hline\n",
+ "\t& Int64 & Int64 & Int64 & Float64 & String & Int64 & Float64? & \\\\\n",
+ "\t\\hline\n",
+ "\t1 & 3216693 & 1 & 5439563 & 25.0 & S & 18375 & 174.0 & $\\dots$ \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/plain": [
+ "GroupedDataFrame with 4553 groups based on keys: isFraud, ProductCD, card4, card6, id_15, id_31\n",
+ "First Group (136 rows): isFraud = 0, ProductCD = \"H\", card4 = \"visa\", card6 = \"debit\", id_15 = \"Found\", id_31 = \"firefox 57.0\". Omitted printing of 429 columns\n",
+ "│ Row │ TransactionID │ isFraud │ TransactionDT │ TransactionAmt │ ProductCD │\n",
+ "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n",
+ "├─────┼───────────────┼─────────┼───────────────┼────────────────┼───────────┤\n",
+ "│ 1 │ 3067453 │ 0 │ 1729328 │ 200.0 │ H │\n",
+ "│ 2 │ 3073296 │ 0 │ 1816710 │ 100.0 │ H │\n",
+ "│ 3 │ 3061872 │ 0 │ 1642116 │ 100.0 │ H │\n",
+ "│ 4 │ 3078930 │ 0 │ 1902540 │ 100.0 │ H │\n",
+ "│ 5 │ 3038788 │ 0 │ 1211592 │ 75.0 │ H │\n",
+ "│ 6 │ 3056336 │ 0 │ 1556463 │ 25.0 │ H │\n",
+ "│ 7 │ 3026050 │ 0 │ 951882 │ 50.0 │ H │\n",
+ "│ 8 │ 2999258 │ 0 │ 348274 │ 50.0 │ H │\n",
+ "│ 9 │ 2995902 │ 0 │ 260116 │ 40.0 │ H │\n",
+ "│ 10 │ 3124444 │ 0 │ 2767769 │ 50.0 │ H │\n",
+ "⋮\n",
+ "│ 126 │ 3002736 │ 0 │ 426917 │ 25.0 │ H │\n",
+ "│ 127 │ 3066817 │ 0 │ 1723040 │ 50.0 │ H │\n",
+ "│ 128 │ 3096864 │ 0 │ 2154200 │ 25.0 │ H │\n",
+ "│ 129 │ 3097062 │ 0 │ 2156024 │ 50.0 │ H │\n",
+ "│ 130 │ 2998791 │ 0 │ 341460 │ 75.0 │ H │\n",
+ "│ 131 │ 3090745 │ 0 │ 2067079 │ 30.0 │ H │\n",
+ "│ 132 │ 3022302 │ 0 │ 861372 │ 50.0 │ H │\n",
+ "│ 133 │ 3056137 │ 0 │ 1553999 │ 125.0 │ H │\n",
+ "│ 134 │ 3085711 │ 0 │ 1988068 │ 30.0 │ H │\n",
+ "│ 135 │ 3099225 │ 0 │ 2180952 │ 50.0 │ H │\n",
+ "│ 136 │ 3051141 │ 0 │ 1453322 │ 75.0 │ H │\n",
+ "⋮\n",
+ "Last Group (1 row): isFraud = 1, ProductCD = \"S\", card4 = \"visa\", card6 = \"credit\", id_15 = \"New\", id_31 = \"mobile safari 11.0\". Omitted printing of 429 columns\n",
+ "│ Row │ TransactionID │ isFraud │ TransactionDT │ TransactionAmt │ ProductCD │\n",
+ "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │\n",
+ "├─────┼───────────────┼─────────┼───────────────┼────────────────┼───────────┤\n",
+ "│ 1 │ 3216693 │ 1 │ 5439563 │ 25.0 │ S │"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " | isFraud | ProductCD | card4 | card6 | id_15 | id_31 | TransactionAmountMax | TransactionAmountMean |
---|
| Int64 | String | String? | String? | String? | String? | Float64 | Float64 |
---|
4,553 rows × 8 columns
1 | 0 | H | visa | debit | Found | firefox 57.0 | 300.0 | 71.1765 |
---|
2 | 0 | R | visa | credit | Found | ie 11.0 for desktop | 1000.0 | 208.58 |
---|
3 | 1 | R | visa | debit | Found | chrome 63.0 for android | 300.0 | 216.667 |
---|
4 | 0 | C | visa | credit | Found | chrome 65.0 | 410.373 | 49.3293 |
---|
5 | 0 | R | visa | debit | Found | chrome 62.0 for android | 200.0 | 98.9583 |
---|
6 | 1 | C | mastercard | credit | New | chrome 63.0 | 225.504 | 47.2223 |
---|
7 | 0 | C | mastercard | credit | Found | chrome 62.0 for android | 154.071 | 44.5909 |
---|
8 | 0 | C | mastercard | credit | Found | chrome 63.0 | 302.111 | 50.8962 |
---|
9 | 1 | C | mastercard | credit | Found | chrome 63.0 | 265.498 | 44.8802 |
---|
10 | 0 | C | mastercard | credit | New | chrome 63.0 | 302.111 | 48.2015 |
---|
11 | 0 | C | mastercard | credit | Found | safari generic | 141.158 | 39.4613 |
---|
12 | 0 | C | mastercard | credit | Unknown | safari generic | 221.54 | 51.347 |
---|
13 | 0 | C | mastercard | credit | Found | mobile safari generic | 162.676 | 53.6838 |
---|
14 | 0 | R | visa | credit | New | chrome 64.0 | 350.0 | 138.889 |
---|
15 | 0 | R | visa | credit | Found | chrome 64.0 | 450.0 | 153.112 |
---|
16 | 0 | R | visa | credit | Found | chrome 65.0 | 900.0 | 177.778 |
---|
17 | 0 | C | mastercard | credit | Found | chrome 64.0 | 220.171 | 50.7401 |
---|
18 | 1 | C | mastercard | credit | Found | chrome 65.0 for android | 185.67 | 53.1863 |
---|
19 | 0 | C | visa | debit | Found | chrome 63.0 | 422.558 | 43.4618 |
---|
20 | 0 | C | visa | debit | Found | chrome 64.0 for android | 268.265 | 28.9167 |
---|
21 | 0 | C | visa | debit | Found | firefox 57.0 | 204.633 | 44.2996 |
---|
22 | 0 | C | visa | debit | Found | chrome 65.0 for android | 230.079 | 32.405 |
---|
23 | 0 | C | visa | debit | Found | chrome 60.0 for android | 96.37 | 31.5289 |
---|
24 | 0 | C | visa | credit | Found | chrome 62.0 | 412.764 | 48.7786 |
---|
25 | 0 | C | mastercard | debit | Found | chrome 64.0 for android | 162.953 | 26.765 |
---|
26 | 0 | C | mastercard | debit | Found | chrome 65.0 for android | 283.37 | 32.5836 |
---|
27 | 0 | R | visa | credit | Found | mobile safari generic | 1000.0 | 167.918 |
---|
28 | 0 | H | visa | credit | Found | chrome 62.0 for ios | 50.0 | 40.0 |
---|
29 | 0 | H | visa | credit | Found | chrome 64.0 | 450.0 | 96.2881 |
---|
30 | 0 | S | visa | credit | New | chrome generic | 200.0 | 47.7391 |
---|
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
---|
"
+ ],
+ "text/latex": [
+ "\\begin{tabular}{r|cccccccc}\n",
+ "\t& isFraud & ProductCD & card4 & card6 & id\\_15 & id\\_31 & TransactionAmountMax & TransactionAmountMean\\\\\n",
+ "\t\\hline\n",
+ "\t& Int64 & String & String? & String? & String? & String? & Float64 & Float64\\\\\n",
+ "\t\\hline\n",
+ "\t1 & 0 & H & visa & debit & Found & firefox 57.0 & 300.0 & 71.1765 \\\\\n",
+ "\t2 & 0 & R & visa & credit & Found & ie 11.0 for desktop & 1000.0 & 208.58 \\\\\n",
+ "\t3 & 1 & R & visa & debit & Found & chrome 63.0 for android & 300.0 & 216.667 \\\\\n",
+ "\t4 & 0 & C & visa & credit & Found & chrome 65.0 & 410.373 & 49.3293 \\\\\n",
+ "\t5 & 0 & R & visa & debit & Found & chrome 62.0 for android & 200.0 & 98.9583 \\\\\n",
+ "\t6 & 1 & C & mastercard & credit & New & chrome 63.0 & 225.504 & 47.2223 \\\\\n",
+ "\t7 & 0 & C & mastercard & credit & Found & chrome 62.0 for android & 154.071 & 44.5909 \\\\\n",
+ "\t8 & 0 & C & mastercard & credit & Found & chrome 63.0 & 302.111 & 50.8962 \\\\\n",
+ "\t9 & 1 & C & mastercard & credit & Found & chrome 63.0 & 265.498 & 44.8802 \\\\\n",
+ "\t10 & 0 & C & mastercard & credit & New & chrome 63.0 & 302.111 & 48.2015 \\\\\n",
+ "\t11 & 0 & C & mastercard & credit & Found & safari generic & 141.158 & 39.4613 \\\\\n",
+ "\t12 & 0 & C & mastercard & credit & Unknown & safari generic & 221.54 & 51.347 \\\\\n",
+ "\t13 & 0 & C & mastercard & credit & Found & mobile safari generic & 162.676 & 53.6838 \\\\\n",
+ "\t14 & 0 & R & visa & credit & New & chrome 64.0 & 350.0 & 138.889 \\\\\n",
+ "\t15 & 0 & R & visa & credit & Found & chrome 64.0 & 450.0 & 153.112 \\\\\n",
+ "\t16 & 0 & R & visa & credit & Found & chrome 65.0 & 900.0 & 177.778 \\\\\n",
+ "\t17 & 0 & C & mastercard & credit & Found & chrome 64.0 & 220.171 & 50.7401 \\\\\n",
+ "\t18 & 1 & C & mastercard & credit & Found & chrome 65.0 for android & 185.67 & 53.1863 \\\\\n",
+ "\t19 & 0 & C & visa & debit & Found & chrome 63.0 & 422.558 & 43.4618 \\\\\n",
+ "\t20 & 0 & C & visa & debit & Found & chrome 64.0 for android & 268.265 & 28.9167 \\\\\n",
+ "\t21 & 0 & C & visa & debit & Found & firefox 57.0 & 204.633 & 44.2996 \\\\\n",
+ "\t22 & 0 & C & visa & debit & Found & chrome 65.0 for android & 230.079 & 32.405 \\\\\n",
+ "\t23 & 0 & C & visa & debit & Found & chrome 60.0 for android & 96.37 & 31.5289 \\\\\n",
+ "\t24 & 0 & C & visa & credit & Found & chrome 62.0 & 412.764 & 48.7786 \\\\\n",
+ "\t25 & 0 & C & mastercard & debit & Found & chrome 64.0 for android & 162.953 & 26.765 \\\\\n",
+ "\t26 & 0 & C & mastercard & debit & Found & chrome 65.0 for android & 283.37 & 32.5836 \\\\\n",
+ "\t27 & 0 & R & visa & credit & Found & mobile safari generic & 1000.0 & 167.918 \\\\\n",
+ "\t28 & 0 & H & visa & credit & Found & chrome 62.0 for ios & 50.0 & 40.0 \\\\\n",
+ "\t29 & 0 & H & visa & credit & Found & chrome 64.0 & 450.0 & 96.2881 \\\\\n",
+ "\t30 & 0 & S & visa & credit & New & chrome generic & 200.0 & 47.7391 \\\\\n",
+ "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/plain": [
+ "4553×8 DataFrame. Omitted printing of 1 columns\n",
+ "│ Row │ isFraud │ ProductCD │ card4 │ card6 │ id_15 │ id_31 │ TransactionAmountMax │\n",
+ "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString?\u001b[39m │ \u001b[90mString?\u001b[39m │ \u001b[90mString?\u001b[39m │ \u001b[90mUnion{Missing, String}\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n",
+ "├──────┼─────────┼───────────┼──────────────────┼─────────┼─────────┼─────────────────────────┼──────────────────────┤\n",
+ "│ 1 │ 0 │ H │ visa │ debit │ Found │ firefox 57.0 │ 300.0 │\n",
+ "│ 2 │ 0 │ R │ visa │ credit │ Found │ ie 11.0 for desktop │ 1000.0 │\n",
+ "│ 3 │ 1 │ R │ visa │ debit │ Found │ chrome 63.0 for android │ 300.0 │\n",
+ "│ 4 │ 0 │ C │ visa │ credit │ Found │ chrome 65.0 │ 410.373 │\n",
+ "│ 5 │ 0 │ R │ visa │ debit │ Found │ chrome 62.0 for android │ 200.0 │\n",
+ "│ 6 │ 1 │ C │ mastercard │ credit │ New │ chrome 63.0 │ 225.504 │\n",
+ "│ 7 │ 0 │ C │ mastercard │ credit │ Found │ chrome 62.0 for android │ 154.071 │\n",
+ "│ 8 │ 0 │ C │ mastercard │ credit │ Found │ chrome 63.0 │ 302.111 │\n",
+ "│ 9 │ 1 │ C │ mastercard │ credit │ Found │ chrome 63.0 │ 265.498 │\n",
+ "│ 10 │ 0 │ C │ mastercard │ credit │ New │ chrome 63.0 │ 302.111 │\n",
+ "⋮\n",
+ "│ 4543 │ 0 │ R │ mastercard │ credit │ New │ chrome 60.0 for android │ 100.0 │\n",
+ "│ 4544 │ 0 │ H │ \u001b[90mmissing\u001b[39m │ debit │ New │ mobile safari 11.0 │ 50.0 │\n",
+ "│ 4545 │ 0 │ R │ visa │ credit │ Unknown │ chrome 61.0 │ 100.0 │\n",
+ "│ 4546 │ 0 │ H │ visa │ credit │ New │ edge 17.0 │ 75.0 │\n",
+ "│ 4547 │ 0 │ H │ visa │ credit │ New │ mobile │ 50.0 │\n",
+ "│ 4548 │ 0 │ R │ american express │ credit │ New │ opera 49.0 │ 200.0 │\n",
+ "│ 4549 │ 0 │ R │ visa │ debit │ Found │ opera │ 100.0 │\n",
+ "│ 4550 │ 1 │ C │ mastercard │ credit │ New │ chrome 59.0 │ 205.682 │\n",
+ "│ 4551 │ 0 │ C │ \u001b[90mmissing\u001b[39m │ \u001b[90mmissing\u001b[39m │ New │ samsung browser 6.4 │ 32.707 │\n",
+ "│ 4552 │ 1 │ R │ visa │ credit │ New │ chrome 64.0 for android │ 300.0 │\n",
+ "│ 4553 │ 1 │ S │ visa │ credit │ New │ mobile safari 11.0 │ 25.0 │"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ENV[\"COLUMNS\"]=120\n",
+ "# note: `groupby` treats `missing` as a regular key, so rows with missing values form their own groups\n",
+ "combine(groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]), \n",
+ " :TransactionAmt=>maximum=>:TransactionAmountMax, \n",
+ " :TransactionAmt=>mean=>:TransactionAmountMean)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Sorting Details"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ " | TransactionID | isFraud | TransactionDT | TransactionAmt | ProductCD | card1 | card2 | card3 | card4 | card5 |
---|
| Int64 | Int64 | Int64 | Float64 | String | Int64 | Float64? | Float64? | String? | Float64? |
---|
144,233 rows × 434 columns (omitted printing of 424 columns)
1 | 3230924 | 0 | 5787419 | 23.443 | C | 1000 | 555.0 | 185.0 | mastercard | 224.0 |
---|
2 | 3020767 | 0 | 842821 | 150.0 | R | 1004 | 583.0 | 150.0 | visa | 226.0 |
---|
3 | 3028973 | 0 | 1022173 | 30.0 | H | 1004 | 583.0 | 150.0 | visa | 226.0 |
---|
4 | 3386444 | 0 | 10082484 | 50.0 | H | 1004 | 583.0 | 150.0 | visa | 226.0 |
---|
5 | 3038871 | 0 | 1212802 | 50.0 | H | 1005 | 543.0 | 150.0 | mastercard | 117.0 |
---|
6 | 3234681 | 0 | 5883179 | 150.0 | R | 1006 | 399.0 | 150.0 | american express | 146.0 |
---|
7 | 3436647 | 0 | 11468973 | 100.0 | R | 1006 | 399.0 | 150.0 | american express | 146.0 |
---|
8 | 3095681 | 0 | 2145214 | 150.0 | R | 1006 | 399.0 | 150.0 | american express | 146.0 |
---|
9 | 3021401 | 0 | 850730 | 23.203 | C | 1007 | 555.0 | 135.0 | mastercard | 224.0 |
---|
10 | 3226241 | 0 | 5651177 | 55.164 | C | 1007 | 555.0 | 135.0 | mastercard | 224.0 |
---|
11 | 3039439 | 0 | 1222657 | 200.0 | R | 1009 | 399.0 | 150.0 | american express | 146.0 |
---|
12 | 3069943 | 0 | 1786013 | 50.0 | H | 1009 | 399.0 | 150.0 | american express | 146.0 |
---|
13 | 3054866 | 0 | 1539008 | 50.0 | H | 1009 | 399.0 | 150.0 | american express | 146.0 |
---|
14 | 3024078 | 0 | 926293 | 150.0 | R | 1009 | 399.0 | 150.0 | american express | 146.0 |
---|
15 | 2999616 | 0 | 354228 | 200.0 | R | 1009 | 399.0 | 150.0 | american express | 146.0 |
---|
16 | 3378279 | 0 | 9826142 | 277.932 | C | 1010 | 555.0 | 121.0 | visa | 226.0 |
---|
17 | 3319221 | 0 | 8183988 | 25.0 | H | 1011 | 543.0 | 150.0 | mastercard | 224.0 |
---|
18 | 3015198 | 0 | 702331 | 50.0 | H | 1012 | 479.0 | 150.0 | visa | 162.0 |
---|
19 | 3008268 | 0 | 562205 | 100.0 | H | 1012 | 479.0 | 150.0 | visa | 162.0 |
---|
20 | 3524124 | 0 | 14154216 | 75.0 | H | 1012 | 479.0 | 150.0 | visa | 162.0 |
---|
21 | 3098185 | 0 | 2166984 | 175.0 | R | 1012 | 479.0 | 150.0 | visa | 162.0 |
---|
22 | 3569931 | 0 | 15573121 | 19.92 | C | 1014 | 555.0 | 117.0 | visa | 226.0 |
---|
23 | 3563146 | 0 | 15300736 | 43.651 | C | 1014 | 555.0 | 117.0 | visa | 226.0 |
---|
24 | 3455404 | 1 | 12076280 | 450.0 | R | 1015 | 555.0 | 144.0 | mastercard | 224.0 |
---|
25 | 3393951 | 0 | 10275141 | 23.564 | C | 1015 | 555.0 | 144.0 | mastercard | 224.0 |
---|
26 | 3068829 | 0 | 1751791 | 63.6 | C | 1015 | 555.0 | 144.0 | mastercard | 224.0 |
---|
27 | 3080080 | 0 | 1919379 | 108.205 | C | 1015 | 555.0 | 144.0 | mastercard | 224.0 |
---|
28 | 3093683 | 0 | 2128731 | 31.132 | C | 1015 | 555.0 | 144.0 | mastercard | 224.0 |
---|
29 | 3159857 | 0 | 3734666 | 22.037 | C | 1015 | 555.0 | 144.0 | mastercard | 224.0 |
---|
30 | 3331629 | 0 | 8498739 | 22.96 | C | 1015 | 555.0 | 144.0 | mastercard | 224.0 |
---|
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
---|
"
+ ],
+ "text/latex": [
+ "\\begin{tabular}{r|ccccccccccc}\n",
+ "\t& TransactionID & isFraud & TransactionDT & TransactionAmt & ProductCD & card1 & card2 & card3 & card4 & card5 & \\\\\n",
+ "\t\\hline\n",
+ "\t& Int64 & Int64 & Int64 & Float64 & String & Int64 & Float64? & Float64? & String? & Float64? & \\\\\n",
+ "\t\\hline\n",
+ "\t1 & 3230924 & 0 & 5787419 & 23.443 & C & 1000 & 555.0 & 185.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t2 & 3020767 & 0 & 842821 & 150.0 & R & 1004 & 583.0 & 150.0 & visa & 226.0 & $\\dots$ \\\\\n",
+ "\t3 & 3028973 & 0 & 1022173 & 30.0 & H & 1004 & 583.0 & 150.0 & visa & 226.0 & $\\dots$ \\\\\n",
+ "\t4 & 3386444 & 0 & 10082484 & 50.0 & H & 1004 & 583.0 & 150.0 & visa & 226.0 & $\\dots$ \\\\\n",
+ "\t5 & 3038871 & 0 & 1212802 & 50.0 & H & 1005 & 543.0 & 150.0 & mastercard & 117.0 & $\\dots$ \\\\\n",
+ "\t6 & 3234681 & 0 & 5883179 & 150.0 & R & 1006 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t7 & 3436647 & 0 & 11468973 & 100.0 & R & 1006 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t8 & 3095681 & 0 & 2145214 & 150.0 & R & 1006 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t9 & 3021401 & 0 & 850730 & 23.203 & C & 1007 & 555.0 & 135.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t10 & 3226241 & 0 & 5651177 & 55.164 & C & 1007 & 555.0 & 135.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t11 & 3039439 & 0 & 1222657 & 200.0 & R & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t12 & 3069943 & 0 & 1786013 & 50.0 & H & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t13 & 3054866 & 0 & 1539008 & 50.0 & H & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t14 & 3024078 & 0 & 926293 & 150.0 & R & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t15 & 2999616 & 0 & 354228 & 200.0 & R & 1009 & 399.0 & 150.0 & american express & 146.0 & $\\dots$ \\\\\n",
+ "\t16 & 3378279 & 0 & 9826142 & 277.932 & C & 1010 & 555.0 & 121.0 & visa & 226.0 & $\\dots$ \\\\\n",
+ "\t17 & 3319221 & 0 & 8183988 & 25.0 & H & 1011 & 543.0 & 150.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t18 & 3015198 & 0 & 702331 & 50.0 & H & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n",
+ "\t19 & 3008268 & 0 & 562205 & 100.0 & H & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n",
+ "\t20 & 3524124 & 0 & 14154216 & 75.0 & H & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n",
+ "\t21 & 3098185 & 0 & 2166984 & 175.0 & R & 1012 & 479.0 & 150.0 & visa & 162.0 & $\\dots$ \\\\\n",
+ "\t22 & 3569931 & 0 & 15573121 & 19.92 & C & 1014 & 555.0 & 117.0 & visa & 226.0 & $\\dots$ \\\\\n",
+ "\t23 & 3563146 & 0 & 15300736 & 43.651 & C & 1014 & 555.0 & 117.0 & visa & 226.0 & $\\dots$ \\\\\n",
+ "\t24 & 3455404 & 1 & 12076280 & 450.0 & R & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t25 & 3393951 & 0 & 10275141 & 23.564 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t26 & 3068829 & 0 & 1751791 & 63.6 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t27 & 3080080 & 0 & 1919379 & 108.205 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t28 & 3093683 & 0 & 2128731 & 31.132 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t29 & 3159857 & 0 & 3734666 & 22.037 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t30 & 3331629 & 0 & 8498739 & 22.96 & C & 1015 & 555.0 & 144.0 & mastercard & 224.0 & $\\dots$ \\\\\n",
+ "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ & \\\\\n",
+ "\\end{tabular}\n"
+ ],
+ "text/plain": [
+ "144233×434 DataFrame. Omitted printing of 426 columns\n",
+ "│ Row │ TransactionID │ isFraud │ TransactionDT │ TransactionAmt │ ProductCD │ card1 │ card2 │ card3 │\n",
+ "│ │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mFloat64?\u001b[39m │ \u001b[90mFloat64?\u001b[39m │\n",
+ "├────────┼───────────────┼─────────┼───────────────┼────────────────┼───────────┼───────┼──────────┼──────────┤\n",
+ "│ 1 │ 3230924 │ 0 │ 5787419 │ 23.443 │ C │ 1000 │ 555.0 │ 185.0 │\n",
+ "│ 2 │ 3020767 │ 0 │ 842821 │ 150.0 │ R │ 1004 │ 583.0 │ 150.0 │\n",
+ "│ 3 │ 3028973 │ 0 │ 1022173 │ 30.0 │ H │ 1004 │ 583.0 │ 150.0 │\n",
+ "│ 4 │ 3386444 │ 0 │ 10082484 │ 50.0 │ H │ 1004 │ 583.0 │ 150.0 │\n",
+ "│ 5 │ 3038871 │ 0 │ 1212802 │ 50.0 │ H │ 1005 │ 543.0 │ 150.0 │\n",
+ "│ 6 │ 3234681 │ 0 │ 5883179 │ 150.0 │ R │ 1006 │ 399.0 │ 150.0 │\n",
+ "│ 7 │ 3436647 │ 0 │ 11468973 │ 100.0 │ R │ 1006 │ 399.0 │ 150.0 │\n",
+ "│ 8 │ 3095681 │ 0 │ 2145214 │ 150.0 │ R │ 1006 │ 399.0 │ 150.0 │\n",
+ "│ 9 │ 3021401 │ 0 │ 850730 │ 23.203 │ C │ 1007 │ 555.0 │ 135.0 │\n",
+ "│ 10 │ 3226241 │ 0 │ 5651177 │ 55.164 │ C │ 1007 │ 555.0 │ 135.0 │\n",
+ "⋮\n",
+ "│ 144223 │ 3449153 │ 0 │ 11892549 │ 4.795 │ C │ 18377 │ 555.0 │ 131.0 │\n",
+ "│ 144224 │ 3453522 │ 0 │ 12008182 │ 24.715 │ C │ 18377 │ 555.0 │ 131.0 │\n",
+ "│ 144225 │ 3464285 │ 0 │ 12352490 │ 3.873 │ C │ 18377 │ 555.0 │ 131.0 │\n",
+ "│ 144226 │ 3464986 │ 0 │ 12401894 │ 13.034 │ C │ 18377 │ 555.0 │ 131.0 │\n",
+ "│ 144227 │ 3469775 │ 0 │ 12544573 │ 45.372 │ C │ 18377 │ 555.0 │ 131.0 │\n",
+ "│ 144228 │ 3472353 │ 0 │ 12628755 │ 67.383 │ C │ 18377 │ 555.0 │ 131.0 │\n",
+ "│ 144229 │ 3140430 │ 0 │ 3178997 │ 100.0 │ H │ 18378 │ 111.0 │ 150.0 │\n",
+ "│ 144230 │ 3077370 │ 0 │ 1887400 │ 150.0 │ R │ 18383 │ 128.0 │ 150.0 │\n",
+ "│ 144231 │ 3304239 │ 0 │ 7910291 │ 25.0 │ S │ 18384 │ 543.0 │ 150.0 │\n",
+ "│ 144232 │ 3058006 │ 0 │ 1604565 │ 100.0 │ H │ 18388 │ 555.0 │ 150.0 │\n",
+ "│ 144233 │ 3102181 │ 0 │ 2231501 │ 100.0 │ R │ 18396 │ 111.0 │ 150.0 │"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sort!(dff, [\"card1\",\"addr1\",\"D9\"])"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Julia 1.4.1",
+ "language": "julia",
+ "name": "julia-1.4"
+ },
+ "language_info": {
+ "file_extension": ".jl",
+ "mimetype": "application/julia",
+ "name": "julia",
+ "version": "1.4.1"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Julia/CSV/Read_CSV.ipynb b/Julia/CSV/Read_CSV.ipynb
index 06e27f8..4b60d4a 100644
--- a/Julia/CSV/Read_CSV.ipynb
+++ b/Julia/CSV/Read_CSV.ipynb
@@ -5,6 +5,9 @@
"metadata": {},
"source": [
"## Reading a CSV file to a DataFrame in Julia (programming language)\n",
+ "\n",
+ "Article: https://towardsdatascience.com/read-csv-to-data-frame-in-julia-programming-lang-77f3d0081c14\n",
+ "\n",
"Julia often offers several ways to do the same thing, and reading a CSV file is one example. In all cases, you will need the `CSV` and `DataFrames` packages. If you don't have them installed, run the following in the Julia REPL: `import Pkg; Pkg.add(\"CSV\"); Pkg.add(\"DataFrames\")`"
]
},