Updates to csv reader comparison

pull/4/head
vaclavdekanovsky 2021-01-21 23:16:45 +01:00
parent 88aae54fb6
commit c0762e987a
2 changed files with 642 additions and 206 deletions

File diff suppressed because it is too large Load Diff

View File

@ -67,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -78,7 +78,7 @@
" \"train_identity.csv\""
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@ -110,21 +110,30 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"┌ Warning: inner joining data frames using join is deprecated, use `innerjoin(df1, df2, on=TransactionID, makeunique=false, validate=(false, false))` instead\n",
"│ caller = ip:0x0\n",
"└ @ Core :-1\n"
]
},
{
"data": {
"text/plain": [
"Dict{Any,Any} with 5 entries:\n",
" \"merge\" => 0.771\n",
" \"sort\" => 5.032\n",
" \"load_transactions\" => 8.045\n",
" \"aggregation\" => 0.034\n",
" \"load_identity\" => 0.502"
" \"merge\" => 6.369\n",
" \"sort\" => 6.896\n",
" \"load_transactions\" => 28.601\n",
" \"aggregation\" => 6.078\n",
" \"load_identity\" => 0.307"
]
},
"execution_count": 12,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -174,13 +183,13 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table class=\"data-frame\"><thead><tr><th></th><th>aggregation</th><th>load_identity</th><th>load_transactions</th><th>merge</th><th>sort</th></tr><tr><th></th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th></tr></thead><tbody><p>1 rows × 5 columns</p><tr><th>1</th><td>0.034</td><td>0.502</td><td>8.045</td><td>0.771</td><td>5.032</td></tr></tbody></table>"
"<table class=\"data-frame\"><thead><tr><th></th><th>aggregation</th><th>load_identity</th><th>load_transactions</th><th>merge</th><th>sort</th></tr><tr><th></th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th></tr></thead><tbody><p>1 rows × 5 columns</p><tr><th>1</th><td>6.078</td><td>0.307</td><td>28.601</td><td>6.369</td><td>6.896</td></tr></tbody></table>"
],
"text/latex": [
"\\begin{tabular}{r|ccccc}\n",
@ -188,7 +197,7 @@
"\t\\hline\n",
"\t& Float64 & Float64 & Float64 & Float64 & Float64\\\\\n",
"\t\\hline\n",
"\t1 & 0.034 & 0.502 & 8.045 & 0.771 & 5.032 \\\\\n",
"\t1 & 6.078 & 0.307 & 28.601 & 6.369 & 6.896 \\\\\n",
"\\end{tabular}\n"
],
"text/plain": [
@ -196,10 +205,10 @@
"│ Row │ aggregation │ load_identity │ load_transactions │ merge │ sort │\n",
"│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n",
"├─────┼─────────────┼───────────────┼───────────────────┼─────────┼─────────┤\n",
"│ 1 │ 0.034 │ 0.502 │ 8.045 │ 0.771 │ 5.032 │"
"│ 1 │ 6.078 │ 0.307 │ 28.601 │ 6.369 │ 6.896 │"
]
},
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -531,6 +540,126 @@
"source": [
"sort!(dff, [\"card1\",\"addr1\",\"D9\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run multiple times"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"run_julia (generic function with 1 method)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"function run_julia()\n",
" s = Dict()\n",
" f = open(\"julia.csv\",\"a\")\n",
"\n",
" # load transactions ~600MB\n",
" ts = now()\n",
" df = CSV.read(joinpath(folder,files[1]), DataFrame)\n",
" te = now()\n",
" time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
" push!(s, \"load_transactions\"=>time_in_sec)\n",
" write(f,string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|load_transactions|\",time_in_sec,\"\\n\"))\n",
" \n",
"\n",
" # load identity ~25MB\n",
" ts = now()\n",
" df2 = CSV.read(joinpath(folder,files[2]), DataFrame)\n",
" te = now()\n",
" time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
" push!(s, \"load_identity\"=>time_in_sec)\n",
" write(f,string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|load_identity|\",time_in_sec,\"\\n\"))\n",
"\n",
" # join\n",
" ts = now()\n",
" dff = join(df, df2, kind = :inner, on = \"TransactionID\")\n",
" te = now()\n",
" time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
" push!(s, \"merge\"=>time_in_sec)\n",
" write(f,string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|merge|\",time_in_sec,\"\\n\"))\n",
"\n",
" # group by\n",
" ts = now()\n",
" grp = combine(groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]), \n",
" :TransactionAmt=>maximum=>:TransactionAmountMax, \n",
" :TransactionAmt=>mean=>:TransactionAmountMean)\n",
" te = now()\n",
" time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
" push!(s, \"aggregation\"=>time_in_sec)\n",
" write(f,string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|aggregation|\",time_in_sec,\"\\n\"))\n",
"\n",
" # group by\n",
" ts = now()\n",
" sort!(dff, [\"card1\",\"addr1\",\"D9\"])\n",
" sort!(dff, [\"addr1\",\"D9\",\"card1\"])\n",
" sort!(dff, [\"D9\",\"card1\",\"addr1\"])\n",
" te = now()\n",
" time_in_sec = (te-ts) / Millisecond(1) * (1 / 1000)\n",
" push!(s, \"sort\"=>time_in_sec)\n",
" write(f,string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|sorting|\",time_in_sec,\"\\n\"))\n",
" \n",
" close(f)\n",
" return s\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"┌ Warning: inner joining data frames using join is deprecated, use `innerjoin(df1, df2, on=TransactionID, makeunique=false, validate=(false, false))` instead\n",
"│ caller = ip:0x0\n",
"└ @ Core :-1\n"
]
}
],
"source": [
"for i in 1:7\n",
" run_julia()\n",
"end"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"2021-01-21 22:56:32|julia|step|6.896\""
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# expected csv output\n",
"string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|load_identity|\",time_in_sec)"
]
}
],
"metadata": {