import time
import numpy as np
import pandas as pd
import pyarrow as pa

def generate_data(total_size, ncols):
    """Build a DataFrame of random float64 columns sized by a byte budget.

    Parameters
    ----------
    total_size : number
        Target payload size in bytes, divided evenly across the columns.
    ncols : int
        Number of columns to create; they are named ``c0``, ``c1``, ...

    Returns
    -------
    pandas.DataFrame
        ``ncols`` columns of ``np.random.randn`` samples, each holding
        ``int(total_size / ncols / itemsize)`` rows, where ``itemsize`` is
        the byte width of a float64.
    """
    bytes_per_value = np.dtype('float64').itemsize
    nrows = int(total_size / ncols / bytes_per_value)

    columns = {}
    for idx in range(ncols):
        columns['c' + str(idx)] = np.random.randn(nrows)
    return pd.DataFrame(columns)

# Byte-size units (binary multiples).
KILOBYTE = 2 ** 10
MEGABYTE = KILOBYTE ** 2

# Benchmark dimensions: total number of bytes (1024 MEGABYTE) and the
# column count used when generating data.
DATA_SIZE = MEGABYTE * 1024
NCOLS = 16

def get_timing(f, niter):
    """Return the average wall-clock seconds taken by one call of ``f``.

    Parameters
    ----------
    f : callable
        Zero-argument callable to benchmark; its return value is discarded.
    niter : int
        Number of times to invoke ``f``. Must be at least 1.

    Returns
    -------
    float
        Total elapsed time divided by ``niter``.

    Raises
    ------
    ValueError
        If ``niter`` is less than 1.
    """
    if niter < 1:
        raise ValueError('niter must be at least 1, got %r' % (niter,))

    # perf_counter is monotonic and available on every platform, whereas
    # clock_gettime(CLOCK_REALTIME) is Unix-only and can jump if the system
    # clock is adjusted mid-measurement.
    start = time.perf_counter()
    for _ in range(niter):
        f()
    # Average over the iteration count that was actually requested rather
    # than an unrelated module-level constant.
    return (time.perf_counter() - start) / niter

def read_as_dataframe(klass, source):
    """Read ``source`` through a reader class and return the pandas result.

    ``klass`` is called with ``source`` to construct a reader; the object
    returned by the reader's ``read_all()`` is then converted with
    ``to_pandas()`` and returned.
    """
    return klass(source).read_all().to_pandas()

# Number of timed repetitions per chunk size, and the accumulator for the
# per-chunk-size benchmark results.
NITER = 10
results = []

# Chunk sizes to benchmark, as (display label, size in bytes) pairs.
CHUNKSIZES = [
    ('16K', KILOBYTE * 16),
    ('64K', KILOBYTE * 64),
    ('256K', KILOBYTE * 256),
    ('1M', MEGABYTE),
    ('16M', MEGABYTE * 16),
]

# Benchmark driver: for each chunk size, write a stream made of identical
# record batches and time reading the whole stream back into pandas.
for name, chunksize in CHUNKSIZES:
    # Number of batches of this size needed to reach DATA_SIZE in total.
    nchunks = DATA_SIZE // chunksize
    # One batch of random data sized by `chunksize`; it is reused for every write.
    batch = pa.RecordBatch.from_pandas(generate_data(chunksize, NCOLS))

    sink = pa.InMemoryOutputStream()
    stream_writer = pa.StreamWriter(sink, batch.schema)

    for i in range(nchunks):
        stream_writer.write_batch(batch)

    # NOTE(review): the writer is never explicitly closed before the buffer
    # is taken -- confirm this pyarrow version does not need close() here.
    source = sink.get_result()

    # Only the read path (reader construction, read_all, to_pandas) is timed;
    # data generation and writing above are excluded.
    elapsed = get_timing(lambda: read_as_dataframe(pa.StreamReader, source), NITER)

    # Record (label, chunk size in bytes, seconds per read) and echo it.
    result = (name, chunksize, elapsed)
    print(result)
    results.append(result)
('16K', 16384, 1.3756619691848755)
('64K', 65536, 0.4991825819015503)
('256K', 262144, 0.17925310134887695)
('1M', 1048576, 0.13184106349945068)
('16M', 16777216, 0.14622862339019777)
import seaborn as sns
sns.barplot?
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Tabulate the raw (label, bytes, seconds) tuples and derive throughput.
results = pd.DataFrame.from_records(results, columns=['size', 'nbytes', 'timing'])
key = 'Gigabytes Per Second'
# DATA_SIZE bytes divided by the timing gives bytes/second; dividing by
# MEGABYTE and then 1024 rescales that to gigabytes/second.
results[key] = DATA_SIZE / results['timing'] / MEGABYTE / 1024
results

title = 'Streaming data throughput by chunksize'
# factorplot builds its own figure, so no separate plt.figure() call is
# made beforehand: one would only leave an empty, unused figure open.
g = sns.factorplot(x=key, y='size', data=results, kind='bar', orient='h', size=10)
g.despine(left=True)
# Optional log-2 x axis: g.fig.get_axes()[0].set_xscale('log', basex=2)

# Keep `fig` bound, pointing at the figure that actually holds the plot.
fig = g.fig
fig.set_size_inches(12, 4)
plt.title(title)
fig.set_tight_layout(True)

plt.savefig('arrow_streaming_benchmarks.png')
<matplotlib.figure.Figure at 0x7f9175f7f5f8>
/home/wesm/anaconda3/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "