# First steps with Numba

In [23]:
import numba as nb
import numpy as np

In [34]:
@nb.jit(nopython=True)
def sum_sq(a):
    """Return the sum of the squares of the elements of ``a``.

    ``a`` must support ``len()`` and integer indexing; the notebook calls
    this with a 1-D NumPy array and with a plain Python list.
    """
    total = 0
    for idx in range(len(a)):
        total += a[idx] ** 2
    return total

In [35]:
# Benchmark input: 10,000 random floats drawn by NumPy.
x = np.random.rand(10000)

In [36]:
# Baseline: `.py_func` is the original, uncompiled Python function.
%timeit sum_sq.py_func(x)

100 loops, best of 3: 3.85 ms per loop


In [37]:
# Compiled version. The very slow outlier run that timeit reports is most
# likely the first call, which has to JIT-compile the function.
%timeit sum_sq(x)

The slowest run took 5773.33 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 11.6 µs per loop


In [12]:
# Reference: the same reduction written as a vectorized NumPy expression.
%timeit (x**2).sum()

The slowest run took 44.19 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 14.8 µs per loop


In [26]:
# The same data as a plain Python list, to compare list input with array input.
x_list = x.tolist()

In [27]:
# Compiled function called with a list instead of an array
# (slower than the array call in the recorded timings).
%timeit sum_sq(x_list)

The slowest run took 523.66 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 199 µs per loop


In [28]:
# Pure-Python baseline over the list. The comprehension reuses the name `x`
# for its loop variable; in Python 3 that variable is local to the
# comprehension, so the global array `x` is not rebound.
%timeit sum([x**2 for x in x_list])

1000 loops, best of 3: 1.28 ms per loop


## Type specializations

In [23]:
# Specializations compiled so far. NOTE(review): the empty list recorded below
# can only come from a freshly defined sum_sq; when the notebook is run top to
# bottom, the timing cells above have already called (and compiled) it.
sum_sq.signatures

[]

In [5]:
# Call once with a float64 array, then list the compiled specializations.
x = np.random.rand(1000).astype('float64')
sum_sq(x)
sum_sq.signatures

[(array(float64, 1d, A),)]

In [15]:
# Repeat with a float32 array and list the compiled specializations again.
x = np.random.rand(10000).astype('float32')
sum_sq(x)
sum_sq.signatures

[(array(float64, 1d, A),), (array(float32, 1d, A),)]

In [41]:
# Explicit signatures: the `.signatures` output recorded right after this cell
# already lists both specializations, i.e. they are compiled at decoration
# time rather than on first call.
@nb.jit(['float64(float64[:])',
         'float32(float32[:])'])
def sum_sq(a):
    # Sum of squares of a 1-D float array (float64 or float32 per signature).
    N = len(a)
    
    # Starts from an int literal; the recorded type dump shows Numba typing
    # `result` as float64 in the float64 specialization.
    result = 0
    for i in range(N):
        result += a[i] ** 2
    
    return result

In [42]:
# Inspect the specializations right after the definition, before any call.
sum_sq.signatures

[(array(float64, 1d, A),), (array(float32, 1d, A),)]

In [43]:
# Print the typed Numba IR, interleaved with the Python source lines.
sum_sq.inspect_types()

sum_sq (array(float64, 1d, A),)
--------------------------------------------------------------------------------
# File: <ipython-input-41-40dcde0acb08>
# --- LINE 1 --- 
# label 0
#   del $0.1
#   del $0.3
#   del $const0.4

@nb.jit(['float64(float64[:])',

         # --- LINE 2 --- 

         'float32(float32[:])'])

# --- LINE 3 --- 

def sum_sq(a):

    # --- LINE 4 --- 
    #   a = arg(0, name=a)  :: array(float64, 1d, A)
    #   $0.1 = global(len: <built-in function len>)  :: Function(<built-in function len>)
    #   $0.3 = call $0.1(a)  :: (array(float64, 1d, A),) -> int64
    #   N = $0.3  :: int64

    N = len(a)

# --- LINE 5 --- 



    # --- LINE 6 --- 
    #   $const0.4 = const(int, 0)  :: int64
    #   result = $const0.4  :: float64
    #   jump 18
    # label 18

    result = 0

    # --- LINE 7 --- 
    #   jump 21
    # label 21
    #   $21.1 = global(range: <class 'range'>)  :: Function(<class 'range'>)
    #   $21.3 = call $21.1(N)  :: (int64,) -> range_state_int64
 

In [45]:
# No nopython=True here. In the type dump recorded below, every variable of
# this function is typed as `pyobject` and the loop is extracted as a
# "LiftedLoop", i.e. the function was compiled in object mode.
@nb.jit
def concatenate(strings):
    # Join all items of `strings` into a single string, in iteration order.
    result = ''
    for s in strings:
        result += s
    return result

In [48]:
# Call once so that a specialization exists, then inspect it.
# Only inspect_types() prints something: the bare `.signatures` expression is
# not the last line of the cell, so its value is evaluated but not displayed.
concatenate(['hello', 'world'])
concatenate.signatures
concatenate.inspect_types()

concatenate (reflected list(str),)
--------------------------------------------------------------------------------
# File: <ipython-input-45-a826ad379539>
# --- LINE 1 --- 
# label 0
#   del $const0.1

@nb.jit

# --- LINE 2 --- 

def concatenate(strings):

    # --- LINE 3 --- 
    #   strings = arg(0, name=strings)  :: pyobject
    #   $const0.1 = const(str, )  :: pyobject
    #   result = $const0.1  :: pyobject
    #   jump 6
    # label 6

    result = ''

    # --- LINE 4 --- 
    #   jump 9
    # label 9
    #   $17 = const(LiftedLoop, LiftedLoop(<function concatenate at 0x7f9939c032f0>))  :: XXX Lifted Loop XXX
    #   $18 = call $17(result, strings)  :: XXX Lifted Loop XXX
    #   del strings
    #   del $17
    #   result = static_getitem(index_var=None, index=0, value=$18)  :: pyobject
    #   del $18
    #   jump 33

    for s in strings:

        # --- LINE 5 --- 
        # label 33
        #   del result

        result += s

    # --- LINE 6 --- 
    #   $33.2 = cast(valu

In [50]:
# 1000 short strings; time the plain Python function first as the baseline.
x = ['hello'] * 1000
%timeit concatenate.py_func(x)

10000 loops, best of 3: 111 µs per loop


In [51]:
# The jitted version of the same call (slower than the plain Python function
# in the recorded timings).
%timeit concatenate(x)

1000 loops, best of 3: 317 µs per loop


In [53]:
# Strings are mostly unsupported in nopython mode (at least in the Numba
# version this notebook was run with), so forcing nopython=True fails here.
@nb.jit(nopython=True)
def concatenate(strings):
    result = ''
    for s in strings:
        result += s  # the recorded TypingError complains about this `+=`
    return result

# Expected to fail: this call is what produced the TypingError recorded below.
concatenate(x)

TypingError: Failed at nopython (nopython frontend)
Invalid usage of += with parameters (const(''), str)
 * parameterized
File "<ipython-input-53-aa35a1ec2403>", line 5
[1] During: typing of intrinsic-call at <ipython-input-53-aa35a1ec2403> (5)

# Ufuncs and Gufuncs

In [65]:
# NumPy vectorization: wrap a scalar Python function so it accepts arrays
@np.vectorize
def cantor_py(a, b):
    """Combine ``a`` and ``b`` with a Cantor-pairing style formula.

    Evaluates ``0.5 * (a + b) * (a + b + 1) + b`` and truncates the result
    to ``int``.
    """
    s = a + b
    return int(0.5 * s * (s + 1) + b)

In [66]:
# The scalar second argument is broadcast against the array.
cantor_py(np.array([1, 2]), 2)

array([ 8, 12])

In [67]:
# Numba vectorization: same scalar kernel, decorated with nb.vectorize
@nb.vectorize
def cantor(a, b):
    """Combine ``a`` and ``b`` with a Cantor-pairing style formula.

    Evaluates ``0.5 * (a + b) * (a + b + 1) + b`` and truncates the result
    to ``int``.
    """
    s = a + b
    return int(0.5 * s * (s + 1) + b)

In [68]:
# Same call as for cantor_py; the recorded result is identical.
cantor(np.array([1, 2]), 2)

array([ 8, 12])

In [69]:
# Two random input arrays of 10,000 elements each for the timings below.
x1 = np.random.rand(10000)
x2 = np.random.rand(10000)

In [70]:
# np.vectorize version.
%timeit cantor_py(x1, x2)

100 loops, best of 3: 6.06 ms per loop


In [71]:
# nb.vectorize version.
%timeit cantor(x1, x2)

The slowest run took 3871.58 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 15 µs per loop


In [73]:
# The same formula as a plain NumPy array expression, which allocates a
# temporary array for each intermediate operation.
%timeit (0.5 * (x1 + x2)*(x1 + x2 + 1) + x2).astype(int)

The slowest run took 6.92 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 57.1 µs per loop


# Matrix multiplication

In [101]:
# Plain 2-D case: a (3, 3) matrix times a (3, 3) matrix.
a = np.random.rand(3, 3)
b = np.random.rand(3, 3)

c = np.matmul(a, b)
c.shape

(3, 3)

In [103]:
# Stacked case: both inputs hold ten 3x3 matrices along the leading axis.
a = np.random.rand(10, 3, 3)
b = np.random.rand(10, 3, 3)

c = np.matmul(a, b)
c.shape  # not displayed: only the last expression of a cell is echoed

# The first matrix of the result equals the product of the first matrices.
np.allclose(np.matmul(a[0], b[0]), c[0])

True

In [106]:
# Broadcasting case: a stack of ten 3x3 matrices times a single 3x3 matrix.
a = np.random.rand(10, 3, 3)
b = np.random.rand(3, 3)
c = np.matmul(a, b)
c.shape

(10, 3, 3)

In [150]:

@nb.guvectorize(['float64[:], float64[:], float64[:]'], '(n), (n) -> ()')
def euclidean(a, b, out):
    """Generalized ufunc: squared Euclidean distance between two vectors.

    Layout ``(n), (n) -> ()``: ``a`` and ``b`` are length-``n`` vectors and
    the scalar result is written to ``out[0]``. No square root is taken.
    """
    acc = 0.0
    for k in range(a.shape[0]):
        delta = a[k] - b[k]
        acc += delta ** 2
    out[0] = acc
    

In [151]:
# The gufunc consumes the last axis (n) and loops over any leading axes.

# Two single vectors.
a = np.random.rand(2)
b = np.random.rand(2)
c = euclidean(a, b)

# Two batches of 10 vectors, paired row by row.
a = np.random.rand(10, 2)
b = np.random.rand(10, 2)
c = euclidean(a, b)

# A batch of 10 vectors against a single vector.
a = np.random.rand(10, 2)
b = np.random.rand(2)
c = euclidean(a, b)

In [152]:
# Larger batch for the timings below: 10,000 pairs of 2-element vectors.
a = np.random.rand(10000, 2)
b = np.random.rand(10000, 2)

In [153]:
# NumPy reference: squared differences summed along each row.
%timeit ((a - b)**2).sum(axis=1)

The slowest run took 8.33 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 200 µs per loop


In [154]:
# The compiled gufunc on the same data.
%timeit euclidean(a, b)

10000 loops, best of 3: 36.2 µs per loop


In [157]:
# Sanity check: the gufunc and the NumPy expression agree.
np.allclose(((a - b)**2).sum(axis=1), euclidean(a, b))

True

# JIT classes

In [242]:
class Node:
    """One cell of a singly linked list: a ``value`` plus a ``next`` link."""

    def __init__(self, value):
        # A new node starts out unlinked.
        self.next, self.value = None, value


class LinkedList:
    """Minimal singly linked list built from ``Node`` objects.

    Only front insertion and printing are supported; ``head`` is the first
    node, or ``None`` while the list is empty.
    """

    def __init__(self):
        # An empty list has no head node.
        self.head = None

    def push_front(self, value):
        """Insert ``value`` at the front of the list."""
        new_head = Node(value)
        # Linking to the previous head also covers the empty list: the
        # previous head is then None, which is exactly the end-of-list marker.
        # This avoids an `== None` test, which would call a user-defined
        # __eq__ instead of checking identity.
        new_head.next = self.head
        self.head = new_head

    def show(self):
        """Print every stored value from head to tail, one per line."""
        node = self.head
        while node is not None:
            print(node.value)
            node = node.next
@nb.jit
def sum_list(lst):
    """Add up the ``value`` of every node reachable from ``lst.head``."""
    total = 0
    current = lst.head
    while current is not None:
        total += current.value
        current = current.next
    return total
        
# Each push goes to the front, so the list reads 3, 2, 1 when printed.
lst = LinkedList()
lst.push_front(1)
lst.push_front(2)
lst.push_front(3)
lst.show()

# Sum of the stored values.
sum_list(lst)

3
2
1


6

In [246]:
lst = LinkedList()
[lst.push_front(i) for i in range(10000)]

%timeit sum_list(lst)
%timeit sum_list.py_func(lst)

100 loops, best of 3: 1.75 ms per loop
1000 loops, best of 3: 2.36 ms per loop


In [247]:
# Node refers to its own type through `next`, so a deferred placeholder type
# is created first and bound to the real class type once the class exists.
node_type = nb.deferred_type()

# Typed fields of the compiled class: a link that may be None, and an
# integer payload.
node_spec = [
    ('next', nb.optional(node_type)),
    ('value', nb.int64),
]


@nb.jitclass(node_spec)
class Node:
    def __init__(self, value):
        # A new node stores its value and starts out unlinked.
        self.value = value
        self.next = None


# Resolve the placeholder now that the jitclass has been created.
node_type.define(Node.class_type.instance_type)


# Typed field of the compiled list: `head` is a Node instance or None.
ll_spec = [
    ('head', nb.optional(Node.class_type.instance_type)),
]


@nb.jitclass(ll_spec)
class LinkedList:

    def __init__(self):
        # Empty list: there is no head node yet.
        self.head = None

    def push_front(self, value):
        # Insert `value` at the front of the list.
        fresh = Node(value)
        if self.head is not None:
            # Non-empty list: the old head becomes the successor of the new node.
            fresh.next = self.head
        self.head = fresh

    def show(self):
        # Print the stored values from head to tail, one per line.
        current = self.head
        while current is not None:
            print(current.value)
            current = current.next


# Same demo as with the pure-Python classes: push 1, 2, 3 to the front and
# print the list, which therefore reads 3, 2, 1.
lst = LinkedList()
lst.push_front(1)
lst.push_front(2)
lst.push_front(3)
lst.show()

3
2
1


In [248]:
lst = LinkedList()
[lst.push_front(i) for i in range(10000)]

%timeit sum_list(lst)
%timeit sum_list.py_func(lst)

The slowest run took 128.84 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 345 µs per loop
100 loops, best of 3: 3.36 ms per loop


# Tips and Tricks

In [203]:
# Ragged nested list: the sublists have different lengths.
a = [[0, 1, 2], 
      [3, 4], 
      [5, 6, 7, 8]]

@nb.jit
def sum_sublists(a):
    """Return a list holding the sum of each sublist of ``a``, in order."""
    # The accumulator is seeded with a dummy 0 so it is never an empty list
    # (presumably so that Numba can infer its element type); the placeholder
    # is sliced off again before returning.
    totals = [0]
    for row in a:
        totals.append(sum(row))
    return totals[1:]

# One total per sublist of `a`.
sum_sublists(a)

[3, 7, 26]