import gc
import random
import time
import pyarrow as pa
import hdfs3

# Size of the benchmark payload in bytes: 200 * 2**20 (200 MiB).
DATA_SIZE = 200 * (1 << 20)
# Bytes rather than str, because the test file below is opened in binary
# mode ('wb').
data = b'a' * DATA_SIZE

# Three clients for localhost:20500 as user 'wesm': pyarrow with its default
# driver, pyarrow with the libhdfs3 driver, and the hdfs3 package.
hdfs = pa.HdfsClient('localhost', 20500, 'wesm')
hdfscpp = pa.HdfsClient('localhost', 20500, 'wesm', driver='libhdfs3')
hdfs3_fs = hdfs3.HDFileSystem('localhost', port=20500, user='wesm')

hdfs.ls('/', False)
# Recorded output of the listing above:
# ['/__ibis', '/hbase', '/home', '/hs2-perf', '/test-warehouse', '/tmp', '/user']

# `path` must be bound before it is used; remove any file left over from a
# previous run so the benchmark always reads a freshly written payload.
path = '/tmp/test-data-file-1'
if hdfs.exists(path):
    hdfs.delete(path)
with hdfs.open(path, 'wb') as f:
    f.write(data)
def read_chunk(f, size):
    """Read ``size`` bytes from ``f`` after seeking to a random offset.

    The offset is drawn with ``random.randint(0, size)``, so it lies in
    ``[0, size]`` inclusive: it is bounded by the chunk size, not by the
    length of the file. Returns whatever ``f.read(size)`` returns.
    """
    offset = random.randint(0, size)
    f.seek(offset)
    return f.read(size)

def ensemble_average(runner, niter=10):
    """Return the mean wall-clock time, in seconds, of one ``runner()`` call.

    ``runner`` is called ``niter`` times back to back with the cyclic
    garbage collector disabled, and the total elapsed time is divided by
    ``niter``.

    Parameters
    ----------
    runner : callable
        Zero-argument callable to time.
    niter : int, default 10
        Number of calls to average over.

    Returns
    -------
    float
        Average seconds per call.

    Raises
    ------
    ZeroDivisionError
        If ``niter`` is 0.
    """
    # Remember the collector state so it is restored, not blindly enabled.
    gc_was_enabled = gc.isenabled()
    gc.disable()
    # Results stay referenced until the function returns, so they are not
    # released inside the timed loop.
    data_chunks = []
    try:
        # perf_counter() is a monotonic wall-clock timer; time.clock() was
        # removed in Python 3.8 and reported CPU time on Unix, which does
        # not account for time spent waiting on I/O.
        start = time.perf_counter()
        for _ in range(niter):
            data_chunks.append(runner())
        elapsed = (time.perf_counter() - start) / niter
    finally:
        # Restore the collector even when runner() raises.
        if gc_was_enabled:
            gc.enable()
    return elapsed

def make_test_func(fh, chunksize):
    """Bind ``fh`` and ``chunksize`` into a zero-argument callable.

    Each call of the returned function performs one
    ``read_chunk(fh, chunksize)`` and returns its result.
    """
    return lambda: read_chunk(fh, chunksize)
# Binary size units, in bytes.
KB = 1024
MB = 1024 * KB
# Read sizes to benchmark, paired position by position with the number of
# reads to average over for each size.
chunksizes = [4 * KB, MB, 10 * MB, 100 * MB]
iterations = [100, 100, 100, 10]

# One read handle on the test file per (library, driver) combination.
handles = {
    ('pyarrow', 'libhdfs'): hdfs.open(path),
    ('pyarrow', 'libhdfs3'): hdfscpp.open(path),
    ('hdfs3', 'libhdfs3'): hdfs3_fs.open(path, 'rb'),
}

# Rows of (library, driver, chunksize, seconds per read, bytes per second).
timings = []
try:
    for (library, driver), handle in handles.items():
        for chunksize, niter in zip(chunksizes, iterations):
            tester = make_test_func(handle, chunksize)
            timing = ensemble_average(tester, niter=niter)
            # Bytes per second, taking each read to return `chunksize` bytes.
            throughput = chunksize / timing

            result = (library, driver, chunksize, timing, throughput)
            print(result)
            timings.append(result)
finally:
    # Release the HDFS file handles even if a benchmark run fails.
    for handle in handles.values():
        handle.close()
('pyarrow', 'libhdfs3', 4096, 0.07757587, 52799.923481360885)
('pyarrow', 'libhdfs3', 1048576, 0.07873925999999982, 13317066.987929558)
('pyarrow', 'libhdfs3', 10485760, 0.08238820000000033, 127272594.86188506)
('pyarrow', 'libhdfs3', 104857600, 0.20835439999999608, 503265589.7835705)
('pyarrow', 'libhdfs', 4096, 0.0004367300000001251, 9378792.388887474)
('pyarrow', 'libhdfs', 1048576, 0.0017632700000001478, 594676935.4664414)
('pyarrow', 'libhdfs', 10485760, 0.01217960000000005, 860928109.2975103)
('pyarrow', 'libhdfs', 104857600, 0.11822549999999979, 886928792.8577183)
('hdfs3', 'libhdfs3', 4096, 0.07886105999999984, 51939.44894983669)
('hdfs3', 'libhdfs3', 1048576, 0.07959901000000003, 13173229.164533574)
('hdfs3', 'libhdfs3', 10485760, 0.08876361000000031, 118131292.76738478)
('hdfs3', 'libhdfs3', 104857600, 0.2404344999999978, 436117112.9767191)
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Tabulate the benchmark rows; the column order mirrors the tuples appended
# to `timings`.
results = pd.DataFrame.from_records(
    timings,
    columns=['library', 'driver', 'read_size', 'timing', 'throughput'],
)
# Convert bytes per second to MB per second (MB is 1024 * 1024 bytes here).
results['MB/s'] = results['throughput'] / MB
results
# Series label combining library and driver, e.g. 'pyarrow+libhdfs'.
results['type'] = results['library'] + '+' + results['driver']
# NOTE(review): factorplot appears to draw on a figure of its own, so this
# figure may remain empty -- confirm before removing it.
plt.figure(figsize=(12, 6))
g = sns.factorplot(
    x='MB/s',
    y='read_size',
    hue='type',
    data=results,
    kind='bar',
    orient='h',
    size=10,
)
g.despine(left=True)
#g.fig.get_axes()[0].set_xscale('log', basex=2)
# The final figure size is set here, after the grid has been created.
g.fig.set_size_inches(12, 4)

plt.savefig('results2.png')
<matplotlib.figure.Figure at 0x7f7845ed1908>

# Display the benchmark results table, including the derived 'MB/s' and
# 'type' columns.
results
library driver read_size timing throughput MB/s type
0 pyarrow libhdfs3 4096 0.077576 5.279992e+04 0.050354 pyarrow+libhdfs3
1 pyarrow libhdfs3 1048576 0.078739 1.331707e+07 12.700145 pyarrow+libhdfs3
2 pyarrow libhdfs3 10485760 0.082388 1.272726e+08 121.376605 pyarrow+libhdfs3
3 pyarrow libhdfs3 104857600 0.208354 5.032656e+08 479.951467 pyarrow+libhdfs3
4 pyarrow libhdfs 4096 0.000437 9.378792e+06 8.944313 pyarrow+libhdfs
5 pyarrow libhdfs 1048576 0.001763 5.946769e+08 567.128120 pyarrow+libhdfs
6 pyarrow libhdfs 10485760 0.012180 8.609281e+08 821.045026 pyarrow+libhdfs
7 pyarrow libhdfs 104857600 0.118225 8.869288e+08 845.841210 pyarrow+libhdfs
8 hdfs3 libhdfs3 4096 0.078861 5.193945e+04 0.049533 hdfs3+libhdfs3
9 hdfs3 libhdfs3 1048576 0.079599 1.317323e+07 12.562970 hdfs3+libhdfs3
10 hdfs3 libhdfs3 10485760 0.088764 1.181313e+08 112.658780 hdfs3+libhdfs3
11 hdfs3 libhdfs3 104857600 0.240434 4.361171e+08 415.913690 hdfs3+libhdfs3
# NOTE(review): the operands look like the MB/s figures for the 104857600-byte
# reads in the table above (pyarrow+libhdfs3 ~479.95, hdfs3+libhdfs3 ~415.91),
# entered by hand -- confirm.
479 / 416
1.1514423076923077