import numpy as np
import pandas as pd
import pyarrow as pa

# Target size of the benchmark payload in bytes (1 << 30).
DATA_SIZE = (1 << 30)
NCOLS = 100

# Build as (NCOLS, rows) and transpose to (rows, NCOLS).  The row count is
# DATA_SIZE split across NCOLS columns of 8-byte values.  Floor division keeps
# it an integer: true division produces a float, and passing a float as an
# array dimension is deprecated in NumPy and slated to become an error.
data = np.random.randn(NCOLS, DATA_SIZE // NCOLS // 8).T
# Overwrite every other row with NaN.
data[::2] = np.nan

df = pd.DataFrame(data, columns=['c' + str(i) for i in range(NCOLS)])

table = pa.Table.from_pandas(df)
/home/wesm/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:8: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
# Time Table -> pandas conversion with nthreads set to 1, 4 and 8.
%timeit df2 = table.to_pandas(nthreads=1)
10 loops, best of 3: 152 ms per loop
%timeit df2 = table.to_pandas(nthreads=4)
10 loops, best of 3: 103 ms per loop
%timeit df2 = table.to_pandas(nthreads=8)
10 loops, best of 3: 175 ms per loop
# Same conversion with all-default arguments.
%timeit df2 = table.to_pandas()
1 loop, best of 3: 497 ms per loop
# Same conversion with block_based=False.
%timeit df2 = table.to_pandas(block_based=False)
1 loop, best of 3: 946 ms per loop
# Single positional argument; presumably nthreads=4 -- TODO confirm against
# the to_pandas signature of the pyarrow version in use.
%timeit df2 = table.to_pandas(4)
1 loop, best of 3: 144 ms per loop
# %timeit executes its statement in a private scope, so a `df2` assigned in a
# timed statement is not bound in the session.  Materialize the converted
# frame explicitly before checking that the round trip reproduces `df`.
df2 = table.to_pandas()
pd.util.testing.assert_almost_equal(df, df2)
import numpy as np
import pandas as pd
import pyarrow as pa

# Element type of every generated column.
type_ = np.dtype('float64')
# Target size of the benchmark payload in bytes (1 << 30).
DATA_SIZE = (1 << 30)
NCOLS = 100
# Rows per column so that NCOLS columns of `type_` add up to about DATA_SIZE
# bytes.  Floor division keeps the count an integer: true division produces a
# float, and passing a float as an array dimension is deprecated in NumPy and
# slated to become an error.
NROWS = DATA_SIZE // NCOLS // type_.itemsize

# One independently allocated random array per column, keyed 'c0', 'c1', ...
data = {
    'c' + str(i): np.random.randn(NROWS)
    for i in range(NCOLS)
}
/home/wesm/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:12: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
# Time construction of a DataFrame from the dict of column arrays.
%timeit df = pd.DataFrame(data)
10 loops, best of 3: 130 ms per loop
# Build the frame again as a plain statement so `df` is bound for the cells below.
df = pd.DataFrame(data)
# NOTE(review): assigns NaN to every 5th row through `df.values`; this only
# changes `df` if `.values` returns a view rather than a copy -- confirm for
# the pandas version in use.
df.values[::5] = np.nan
table = pa.Table.from_pandas(df)
table
<pyarrow.table.Table at 0x7f1a68a93b40>
table.column(0)
<pyarrow.table.Column at 0x7f1a68a6b900>
# Time Table -> pandas conversion with nthreads=1 and nthreads=4.
%timeit df2 = table.to_pandas(nthreads=1)
10 loops, best of 3: 158 ms per loop
%timeit df2 = table.to_pandas(nthreads=4)
10 loops, best of 3: 107 ms per loop
arr = data['c0']
arr = data['c0']
%timeit for i in range(100): arr2 = arr.copy()
10 loops, best of 3: 73.3 ms per loop