Comparing data structures

Data structures perform differently for the insertion and selection of data.

Experiment

A simple experiment has been set up to gather experimental data.

The structures have been implemented manually instead of using the standard types.

from base_structure import *
from data import *

class ArrayStructure(BaseStructure):
    def __init__(self, max:int) -> None:
        self.records=[None]*max
        self.last_index=0

    def add(self, record:Data) -> None:
        self.records[self.last_index] = record
        self.last_index+=1

    def find(self, key:str) -> str:
        for i in range(0,len(self.records)):
            if self.records[i].key == key:
                return self.records[i].value
        return None
from base_structure import *
from data import *

class BTree(BaseStructure):
    class Node:
        def __init__(self, value:Data) -> None:
            self.left = None
            self.right = None
            self.value = value

    def __init__(self) -> None:
        self.head = None

    def add(self, record:Data) -> None:
        node = self.Node(record)
        if(self.head is None):
            self.head = node
        else:
            self.__add(node, self.head)

    def __add(self, node:Node, head:Node) -> None:
            if head.value.key > node.value.key:
                if head.right is None:
                    head.right = node
                else:
                    self.__add(node, head.right)
            else:
                if head.left is None:
                    head.left = node
                else:
                    self.__add(node, head.left)

    def find(self, key:str) -> str:
        return self.__find(key, self.head)

    def __find(self, key:str, head:Node) -> str:
        if head is None:
            return None
        elif head.value.key == key:
            return head.value.value
        elif head.value.key < key:
            return self.__find(key, head.left)
        else:
            return self.__find(key, head.right)
from base_structure import *
from data import *

class LinkedList(BaseStructure):
    class Entry:
        def __init__(self, record:Data) -> None:
            self.record = record
            self.next = None

    def __init__(self) -> None:
        self.head = None
        self.tail = None

    def add(self, record:Data) -> None:
        entry = self.Entry(record)
        if self.head is None:
            self.head = entry
            self.tail = self.head
        else:
            self.tail.next = entry
            self.tail = self.tail.next

    def find(self, key:str) -> str:
        cur = self.head
        while cur is not None and cur.record.key != key:
            cur = cur.next
        return cur.record.value if cur is not None else None
from base_structure import *
from linked_list import *
from data import *

class HashMap(BaseStructure):

    def __init__(self) -> None:
        self.array_size = 31
        self.array = [None]*self.array_size

    def add(self, data:Data):
        hash = self.__hash(data.key)
        if self.array[hash] is None:
            self.array[hash] = LinkedList()
        self.array[hash].add(data)

    def find(self, key:str) -> str:
        hash = self.__hash(key)
        if self.array[hash] is None:
            return None
        else:
            return self.array[hash].find(key)

    def __hash(self, string:str) -> int:
        value = 0
        for element in range(0, len(string)):
            value+=ord(string[element])
        return value%self.array_size

base class and data:

from abc import abstractmethod, ABCMeta
from data import *

class BaseStructure(metaclass=ABCMeta):

    @abstractmethod
    def add(self, record:Data) -> None:
        '''
        Init the data with n records and stores a 'sample' to be used in find
        '''
        pass

    @abstractmethod
    def find(self, key:str) -> str:
        '''
        Find the 'sample' created in the init
        '''
        pass
class Data:
    def __init__(self, key:str, value:str) -> None:
        self.key = key
        self.value = value

The experiment consist in inserting unordered key-value pairs into various structures. One random key is then used to measure the performances to return a value. The experiment also checks the correctness of the returned value. No failures have been detected.

from hwcounter import count, count_end
import numpy as np

from data import *
from array_structure import *
from btree import *
from hash_map import *

#
# prepare data

tot_elements = 10000
key_to_find = 5000
expected_key_to_find = "key{i}".format(i=key_to_find)
expected_value_to_find = "value{i}".format(i=key_to_find)
data = [Data("key{i}".format(i=i),"value{i}".format(i=i)) for i in range(1,tot_elements)]
np.random.shuffle(data)

#
# for the stats

performances = []

class Performance:
    def __init__(self, name:str, init:int, find:int, fail:bool) -> None:
        self.name = name
        self.init = init
        self.find = find
        self.fail = fail

structs = [ArrayStructure(tot_elements), LinkedList(), BTree(), HashMap()]

#
# actual test

for struct in structs:
    fail = False
    start = count()
    for d in data:
        struct.add(d)
    init = count_end() - start

    start = count()
    if struct.find(expected_key_to_find) != expected_value_to_find:
        fail = True
    find = count_end() - start

    performances.append(Performance(struct.__class__.__name__, init,find, fail))

#
# output

print('{:15} {:15} {:15} {}'.format('type','time to insert','time to find', 'fail'))
for performance in performances:
    print('{name:15} {init:15} {find:15} {fail}'.format(
        name = performance.name,
        init = "{:,}".format(performance.init),
        find = "{:,}".format(performance.find),
        fail = performance.fail))

Result

The following is an example of the result obtained from the experiment. Multiple executions are consistent with this report.

type time to insert time to find fail
ArrayStructure 43,993,376 9,033,290 False
LinkedList 145,084,444 9,202,950 False
BTree 796,571,810 669,562 False
HashMap 233,445,188 258,856 False

Time is expressed in CPU cycles.

Review of the results

As expected Arrays are by far the most efficient structure to store data, but they are also very inefficient when it is time to find a specific value.

Overall, linked lists perform worse than Arrays in insertion and comparably in selection.

Btree is the most complex structure and they have an expensive insertion process, but they are very efficient in retrieving data.

Hash maps effectively optimize linked lists resulting in slightly less efficient insertion but the fastest structure to retrieve data from.

Key takeaways

  • arrays should be preferred every time there is no need to search for a specific entry.
  • linked lists should be used as a last resort to replace arrays when it is not known the number of elements that they should contain.
  • B-trees are recommended when it is important to give priority to searching for an element.
  • Hash maps are a balanced choice when it is possible to define an index for the data.

Other structures

Queues and stack do not fit the use case of this experiment, but, given their nature, they would perform like a linked list.