Updates to csv reader comparison
parent
88aae54fb6
commit
c0762e987a
File diff suppressed because it is too large
Load Diff
|
@ -67,7 +67,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -78,7 +78,7 @@
|
|||
" \"train_identity.csv\""
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -110,21 +110,30 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"┌ Warning: inner joining data frames using join is deprecated, use `innerjoin(df1, df2, on=TransactionID, makeunique=false, validate=(false, false))` instead\n",
|
||||
"│ caller = ip:0x0\n",
|
||||
"└ @ Core :-1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Dict{Any,Any} with 5 entries:\n",
|
||||
" \"merge\" => 0.771\n",
|
||||
" \"sort\" => 5.032\n",
|
||||
" \"load_transactions\" => 8.045\n",
|
||||
" \"aggregation\" => 0.034\n",
|
||||
" \"load_identity\" => 0.502"
|
||||
" \"merge\" => 6.369\n",
|
||||
" \"sort\" => 6.896\n",
|
||||
" \"load_transactions\" => 28.601\n",
|
||||
" \"aggregation\" => 6.078\n",
|
||||
" \"load_identity\" => 0.307"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -174,13 +183,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<table class=\"data-frame\"><thead><tr><th></th><th>aggregation</th><th>load_identity</th><th>load_transactions</th><th>merge</th><th>sort</th></tr><tr><th></th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th></tr></thead><tbody><p>1 rows × 5 columns</p><tr><th>1</th><td>0.034</td><td>0.502</td><td>8.045</td><td>0.771</td><td>5.032</td></tr></tbody></table>"
|
||||
"<table class=\"data-frame\"><thead><tr><th></th><th>aggregation</th><th>load_identity</th><th>load_transactions</th><th>merge</th><th>sort</th></tr><tr><th></th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th><th>Float64</th></tr></thead><tbody><p>1 rows × 5 columns</p><tr><th>1</th><td>6.078</td><td>0.307</td><td>28.601</td><td>6.369</td><td>6.896</td></tr></tbody></table>"
|
||||
],
|
||||
"text/latex": [
|
||||
"\\begin{tabular}{r|ccccc}\n",
|
||||
|
@ -188,7 +197,7 @@
|
|||
"\t\\hline\n",
|
||||
"\t& Float64 & Float64 & Float64 & Float64 & Float64\\\\\n",
|
||||
"\t\\hline\n",
|
||||
"\t1 & 0.034 & 0.502 & 8.045 & 0.771 & 5.032 \\\\\n",
|
||||
"\t1 & 6.078 & 0.307 & 28.601 & 6.369 & 6.896 \\\\\n",
|
||||
"\\end{tabular}\n"
|
||||
],
|
||||
"text/plain": [
|
||||
|
@ -196,10 +205,10 @@
|
|||
"│ Row │ aggregation │ load_identity │ load_transactions │ merge │ sort │\n",
|
||||
"│ │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n",
|
||||
"├─────┼─────────────┼───────────────┼───────────────────┼─────────┼─────────┤\n",
|
||||
"│ 1 │ 0.034 │ 0.502 │ 8.045 │ 0.771 │ 5.032 │"
|
||||
"│ 1 │ 6.078 │ 0.307 │ 28.601 │ 6.369 │ 6.896 │"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -531,6 +540,126 @@
|
|||
"source": [
|
||||
"sort!(dff, [\"card1\",\"addr1\",\"D9\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Run multiple times"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"run_julia (generic function with 1 method)"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Run the load / merge / aggregate / sort benchmark once.\n",
|
||||
"#\n",
|
||||
"# Each step's elapsed seconds are stored in the returned Dict and appended to\n",
|
||||
"# \"julia.csv\" as a `timestamp|julia|<step>|<seconds>` row.\n",
|
||||
"# Uses `folder` and `files`, which are defined outside this function.\n",
|
||||
"function run_julia()\n",
|
||||
"    s = Dict()\n",
|
||||
"\n",
|
||||
"    # Seconds between two DateTimes, computed from their millisecond difference.\n",
|
||||
"    seconds_between(ts, te) = (te-ts) / Millisecond(1) * (1 / 1000)\n",
|
||||
"\n",
|
||||
"    # The do-block form of open closes the log file even if a step throws.\n",
|
||||
"    open(\"julia.csv\",\"a\") do f\n",
|
||||
"        # Store one timing in `s` and append the matching row to the CSV log.\n",
|
||||
"        function record(key, label, time_in_sec)\n",
|
||||
"            push!(s, key=>time_in_sec)\n",
|
||||
"            write(f,string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|\",label,\"|\",time_in_sec,\"\\n\"))\n",
|
||||
"        end\n",
|
||||
"\n",
|
||||
"        # load transactions ~600MB\n",
|
||||
"        ts = now()\n",
|
||||
"        df = CSV.read(joinpath(folder,files[1]), DataFrame)\n",
|
||||
"        te = now()\n",
|
||||
"        record(\"load_transactions\", \"load_transactions\", seconds_between(ts, te))\n",
|
||||
"\n",
|
||||
"        # load identity ~25MB\n",
|
||||
"        ts = now()\n",
|
||||
"        df2 = CSV.read(joinpath(folder,files[2]), DataFrame)\n",
|
||||
"        te = now()\n",
|
||||
"        record(\"load_identity\", \"load_identity\", seconds_between(ts, te))\n",
|
||||
"\n",
|
||||
"        # join: innerjoin replaces the deprecated join(...; kind = :inner)\n",
|
||||
"        ts = now()\n",
|
||||
"        dff = innerjoin(df, df2, on = \"TransactionID\")\n",
|
||||
"        te = now()\n",
|
||||
"        record(\"merge\", \"merge\", seconds_between(ts, te))\n",
|
||||
"\n",
|
||||
"        # group by (the result is discarded; only the elapsed time is kept)\n",
|
||||
"        ts = now()\n",
|
||||
"        grp = combine(groupby(dff, [\"isFraud\",\"ProductCD\",\"card4\",\"card6\",\"id_15\",\"id_31\"]),\n",
|
||||
"                      :TransactionAmt=>maximum=>:TransactionAmountMax,\n",
|
||||
"                      :TransactionAmt=>mean=>:TransactionAmountMean)\n",
|
||||
"        te = now()\n",
|
||||
"        record(\"aggregation\", \"aggregation\", seconds_between(ts, te))\n",
|
||||
"\n",
|
||||
"        # sort in place, three times with different key orders\n",
|
||||
"        ts = now()\n",
|
||||
"        sort!(dff, [\"card1\",\"addr1\",\"D9\"])\n",
|
||||
"        sort!(dff, [\"addr1\",\"D9\",\"card1\"])\n",
|
||||
"        sort!(dff, [\"D9\",\"card1\",\"addr1\"])\n",
|
||||
"        te = now()\n",
|
||||
"        # NOTE(review): the Dict key is \"sort\" but the CSV label is \"sorting\";\n",
|
||||
"        # both are kept unchanged so new rows match rows already in the log.\n",
|
||||
"        record(\"sort\", \"sorting\", seconds_between(ts, te))\n",
|
||||
"    end\n",
|
||||
"\n",
|
||||
"    return s\n",
|
||||
|
|
||||
"end"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"┌ Warning: inner joining data frames using join is deprecated, use `innerjoin(df1, df2, on=TransactionID, makeunique=false, validate=(false, false))` instead\n",
|
||||
"│ caller = ip:0x0\n",
|
||||
"└ @ Core :-1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Call run_julia seven times in a row; the return values are discarded.\n",
|
||||
"foreach(_ -> run_julia(), 1:7)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"2021-01-21 22:56:32|julia|step|6.896\""
|
||||
]
|
||||
},
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# expected csv output\n",
|
||||
"string(Dates.format(now(), \"YYYY-mm-dd HH:MM:SS\"),\"|julia|load_identity|\",time_in_sec)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
|
Loading…
Reference in New Issue