In [1]:
# This cell contains some utility functions to prepare and execute the benchmarks
import timeit
from random import choice
from string import ascii_uppercase

def random_string(length):
    """Build a random string of *length* uppercase ascii characters.

    Every character is drawn independently with ``random.choice`` from
    ``string.ascii_uppercase``. A *length* of zero or less yields ``''``.
    """
    letters = []
    for _ in range(length):
        letters.append(choice(ascii_uppercase))
    return ''.join(letters)

def print_scaling(stmt, setup, sizes=(10000, 20000, 30000), repeat=False, units='us'):
    """Print scaling information for the statement *stmt*, executed after *setup*.

    The *setup* and *stmt* arguments are template strings in which "{N}"
    is replaced by the size of the input (via ``str.format``, so any other
    literal brace in the templates has to be doubled).

    *sizes* is an iterable of input sizes; one timing is reported per size.

    The *repeat* flag determines whether the setup needs to be run before
    every single execution of *stmt*. If true, *stmt* is executed once per
    run for 1000 runs (the setup is redone for each run); otherwise it is
    executed 1000 times per run for 3 runs. In both cases the best time for
    one execution is reported.

    *units* selects the unit of the printed timings: 's', 'ms', 'us' or
    'ns'. An unsupported value raises ``KeyError`` before anything is timed.
    """
    # Resolve the unit first so that a typo fails immediately instead of
    # after all the benchmarks have already been run.
    unit_factor = {'s': 1.0,
                   'ms': 1e3,
                   'us': 1e6,
                   'ns': 1e9}[units]

    # Materialise once: *sizes* is iterated for the timings and again for
    # the report, which would silently drop everything for a generator.
    sizes = list(sizes)

    # (executions per run, number of runs)
    number, runs = (1, 1000) if repeat else (1000, 3)

    values = []
    for size in sizes:
        timings = timeit.repeat(stmt.format(N=size),
                                setup=setup.format(N=size),
                                number=number, repeat=runs)
        # Best run, normalised to the time of a single execution.
        values.append(min(timings) / number)

    print(' | '.join('N = {} t = {:.2f} ({})'.format(n, t * unit_factor, units)
                     for n, t in zip(sizes, values)))

# Dataframes

In [3]:
# Baseline: label lookup on a Series whose index is range({N}), i.e. every
# label appears exactly once and the labels are in increasing order.
print_scaling(
    stmt='series[1000]',
    setup='import pandas as pd; series = pd.Series(range({N}), index=range({N}))',
)

N = 10000 t = 12.30 (us) | N = 20000 t = 12.58 (us) | N = 30000 t = 13.30 (us)


In [8]:
# Same lookup, but the index now holds every label in range({N}//2) twice and
# is shuffled with a fixed seed, so it is neither unique nor sorted.
# The index has 2 * ({N}//2) entries while the data has {N}, so the two
# lengths only match for even sizes.
setup_code = '''
import pandas as pd
import random

index = list(range({N}//2)) + list(range({N}//2))
random.seed(42)
random.shuffle(index)

series = pd.Series(range({N}), index=index)
'''
print_scaling(stmt='series[1000]', setup=setup_code)

N = 10000 t = 494.95 (us) | N = 20000 t = 814.10 (us) | N = 30000 t = 1129.95 (us)


In [12]:
# Duplicated, shuffled index as in the previous benchmark, but the Series is
# sorted by its index (inside the untimed setup) before the lookup is timed.
# The index has 2 * ({N}//2) entries while the data has {N}, so the two
# lengths only match for even sizes.
setup_code = '''
import pandas as pd
import random

index = list(range({N}//2)) + list(range({N}//2))
random.seed(42)
random.shuffle(index)

series = pd.Series(range({N}), index=index)
series.sort_index(inplace=True)
'''
print_scaling(stmt='series[1000]', setup=setup_code)

N = 10000 t = 145.93 (us) | N = 20000 t = 145.81 (us) | N = 30000 t = 145.66 (us)
