Report added
Parent c5938d23a2
Commit d22178dfab
@@ -1,3 +1,6 @@
# created by virtualenv automatically
venv/*
.idea/*
ADBISProject2.pdf
watdiv*
Report/out
@@ -0,0 +1,107 @@
from rdflib import Graph, namespace
from collections import defaultdict
import time


def merge_tuples(a_tup, b_tup, b_excl):
    # Concatenate two tuples, dropping the join column (index b_excl) of the second one
    b_list = list(b_tup)
    del b_list[b_excl]
    return tuple(list(a_tup) + b_list)


def hash_join(a_table, b_table, a_index=1, b_index=0):
    # Create a hash dict and fill it with the first table
    hash_dict = defaultdict(list)
    for a_tup in a_table:
        hash_dict[a_tup[a_index]].append(a_tup)
    # For each row of the second table, emit one merged row per matching row of the first table
    return [merge_tuples(a_tup, b_tup, b_index)
            for b_tup in b_table for a_tup in hash_dict[b_tup[b_index]]]


class SortMergeJoin:
    def get_key(self, a):
        return self.selected_tuples[a][self.indexes[a]]

    def active_key(self):
        return self.get_key(self.a_is_active)

    def inactive_key(self):
        return self.get_key(not self.a_is_active)

    def __init__(self, a_table, b_table, a_index=1, b_index=0):
        # First sort both tables and create iterators over the sorted rows
        self.iterators = {
            True: iter(sorted(a_table, key=lambda tup: tup[a_index])),
            False: iter(sorted(b_table, key=lambda tup: tup[b_index]))
        }
        self.indexes = {
            True: a_index,
            False: b_index
        }
        self.a_is_active = True
        self.result = list()
        try:
            self.selected_tuples = {
                True: next(self.iterators[True]),
                False: next(self.iterators[False])
            }
        except StopIteration:
            # One of the tables is empty, so the join result stays empty
            self.selected_tuples = None

    def join(self):
        if self.selected_tuples is None:
            return self.result
        while True:
            try:
                while self.active_key() <= self.inactive_key():
                    if self.active_key() == self.inactive_key():
                        self.result.append(merge_tuples(self.selected_tuples[True], self.selected_tuples[False], self.indexes[False]))
                    # Known bug (see the report's analysis section): on equal keys only
                    # one pointer advances, so rows sharing a duplicate join key are
                    # paired at most once instead of forming a full cross product.
                    self.selected_tuples[self.a_is_active] = next(self.iterators[self.a_is_active])
                self.a_is_active = not self.a_is_active
            except StopIteration:
                break
        return self.result


def nested_loop_join(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of the first table with every row of the second table
    return [merge_tuples(a_tup, b_tup, b_index)
            for a_tup in a_table for b_tup in b_table if a_tup[a_index] == b_tup[b_index]]


def compare(graph):
    print(f"{len(graph)} records loaded")
    wsdbm = namespace.Namespace("wsdbm:")
    rev = namespace.Namespace("rev:")
    properties = {
        "follow": wsdbm.follows,
        "friend": wsdbm.friendOf,
        "like": wsdbm.likes,
        "review": rev.hasReview
    }
    # Build one (subject, object) table per property
    comp_tables = dict()
    for p in properties:
        comp_tables[p] = [(s.n3(), o.n3()) for s, _, o in graph.triples((None, properties[p], None))]

    hash_start = time.time()
    joinh1 = hash_join(comp_tables["follow"], comp_tables["friend"])
    joinh2 = hash_join(joinh1, comp_tables["like"], 2)
    joinh3 = hash_join(joinh2, comp_tables["review"], 3)
    print(f"{time.time() - hash_start}s for Hash Join ({len(joinh3)} items)")

    merge_sort_start = time.time()
    joinsm1 = SortMergeJoin(comp_tables["follow"], comp_tables["friend"]).join()
    joinsm2 = SortMergeJoin(joinsm1, comp_tables["like"], 2).join()
    joinsm3 = SortMergeJoin(joinsm2, comp_tables["review"], 3).join()
    print(f"{time.time() - merge_sort_start}s for Sort Merge Join ({len(joinsm3)} items)")

    loop_start = time.time()
    joinnl1 = nested_loop_join(comp_tables["follow"], comp_tables["friend"])
    joinnl2 = nested_loop_join(joinnl1, comp_tables["like"], 2)
    joinnl3 = nested_loop_join(joinnl2, comp_tables["review"], 3)
    print(f"{time.time() - loop_start}s for Nested Loop Join ({len(joinnl3)} items)")


g = Graph()
g.parse("watdiv-url100k.txt", format="nt")
compare(g)

h = Graph()
h.parse("watdiv.10M.nt", format="nt")
compare(h)
@@ -0,0 +1,152 @@
%! Author = mrmcx
%! Date = 14.07.2022
%! Template = Fabian Wenzelmann, 2016--2019

\documentclass[a4paper,
    twoside,      % two-sided mode
    headlines=2.1 % number of lines in the heading, increase if you want more
]{scrartcl}
\usepackage[
    margin=2cm,
    includefoot,
    footskip=35pt,
    includeheadfoot,
    headsep=0.5cm,
]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{lmodern}
\usepackage[automark,headsepline]{scrlayer-scrpage}
\usepackage{enumerate}
\usepackage[protrusion=true,expansion=true,kerning]{microtype}
\usepackage{hyperref}

\newcommand{\yourname}{Simon Moser}
\newcommand{\lecture}{Advanced Database and Information Systems}
\newcommand{\project}{Project 2}
\author{\yourname}
\title{\lecture}
\subtitle{\project}

\pagestyle{scrheadings}
\setkomafont{pagehead}{\normalfont}
\lohead{\lecture\\\yourname}
\lehead{\lecture\\\yourname}
\rohead{\project}
\rehead{\project}

\begin{document}
\maketitle

\section{Problem Statement}
\label{sec:problem-statement}
This report presents different join\cite{wikijoin} algorithms for tables.
When two tables are joined, the columns of both tables are combined based on one related column in each table.
Every possible combination of rows in which those columns are equal is returned.
A very simple approach is to loop through the first table and, for each row, loop through the second table and check the related columns for equality; a minimal sketch of this approach follows below.
Since this approach has a very high complexity of $O(M \cdot N)$ for table sizes $M$ and $N$, the following sections describe more efficient approaches to this problem.
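
A minimal Python sketch of this naive approach (a condensed version of the \verb|nested_loop_join| function in the accompanying code; tables are lists of tuples and the join columns are given by index):

\begin{verbatim}
def nested_loop_join(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of the first table with every row of the second;
    # the duplicated join column is kept here for brevity.
    return [a_tup + b_tup for a_tup in a_table for b_tup in b_table
            if a_tup[a_index] == b_tup[b_index]]
\end{verbatim}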

\section{Algorithm Description}
\label{sec:algorithm-description}

\subsection{Hash Join}
\label{subsec:hash-join}
The Hash Join\cite{wikihash} algorithm offers a faster solution by using a hash table.
A rough description of the algorithm, with a short sketch following the list, is:
\begin{enumerate}
    \item Add each row of the smaller table to a hash table, using the compared column as the key.
    \item For each row of the other table, look up the hash of its compared column.
    \item If the hash exists in the hash table, compare the actual column values to make sure that not only the hash is equal.
\end{enumerate}
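
A condensed Python sketch of this procedure, abbreviated from the \verb|hash_join| function in the accompanying code:

\begin{verbatim}
from collections import defaultdict

def hash_join(a_table, b_table, a_index=1, b_index=0):
    hash_dict = defaultdict(list)  # join key -> rows of the first table
    for a_tup in a_table:
        hash_dict[a_tup[a_index]].append(a_tup)
    # Probe the hash table with each row of the second table
    return [a_tup + b_tup for b_tup in b_table
            for a_tup in hash_dict[b_tup[b_index]]]
\end{verbatim}

Note that Python dictionaries perform the equality check of step 3 internally when a key is looked up.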

One caveat of this approach is that its speed relies on the hash table fitting in memory.
If the hash table exceeds the available memory, the join is slowed down considerably.

This algorithm is expected to have a complexity of $O(M+N)$, because the first table is scanned once to build the hash table and the second table is scanned once to probe it.
Here, a single hash table access is expected to have a complexity of $O(1)$.

\subsection{Sort Merge Join}
\label{subsec:sort-merge-join}
A Sort Merge Join\cite{wikimergesort} restricts the complexity by relying on efficient sorting.
For this approach, both tables are first sorted by the compared column.
Then a pointer is set to the start of each table.
In each step, the compared columns under the two pointers are checked against each other.
If they are equal, the combined rows are added to the result.
If not, the pointer in the table with the smaller value is advanced by one row.

This approach relies on two things: first, the content of the compared column has to be sortable, i.e.\ it must have a total order.
Second, since both tables need to be sorted first, an efficient sort algorithm has to be used.
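
A minimal sketch of the merge step, assuming for simplicity that the join keys are unique within each table (duplicate keys would require pairing whole groups of equal-keyed rows; the helper name \verb|merge_step| is illustrative and not part of the accompanying code):

\begin{verbatim}
def merge_step(a_sorted, b_sorted, a_index=1, b_index=0):
    result, i, j = [], 0, 0
    while i < len(a_sorted) and j < len(b_sorted):
        a_key = a_sorted[i][a_index]
        b_key = b_sorted[j][b_index]
        if a_key == b_key:
            result.append(a_sorted[i] + b_sorted[j])
            i += 1
            j += 1
        elif a_key < b_key:
            i += 1  # advance the pointer with the smaller key
        else:
            j += 1
    return result
\end{verbatim}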

Sorting the two tables with merge sort has a complexity of $O(M \log M + N \log N)$.
This exceeds the complexity of the approach in subsection~\ref{subsec:hash-join}.
Still, this algorithm can be more efficient when the tables are already sorted.
In this case, the complexity drops to the $O(M+N)$ required for walking through both tables once.

\section{Dataset Description}
\label{sec:dataset-description}
For the analysis, the WatDiv\cite{watdiv} dataset is used.
It consists of the following entity types:
\begin{itemize}
    \item wsdbm:User
    \item wsdbm:Product
    \item wsdbm:Review
\end{itemize}

They have the following relations:
\begin{itemize}
    \item wsdbm:User wsdbm:follows wsdbm:User
    \item wsdbm:User wsdbm:friendOf wsdbm:User
    \item wsdbm:User wsdbm:likes wsdbm:Product
    \item wsdbm:Product rev:hasReview wsdbm:Review
\end{itemize}

\section{Experiment and Analysis}
\label{sec:experiment-and-analysis}

\subsection{Preparations}
\label{subsec:preparations}
To import the dataset with the Python library rdflib\cite{rdflib}, the prefixed terms in the file were wrapped in angle brackets (URI form) using the following bash command:

\verb$ sed -E 's/([a-zA-Z0-9]+:[^ \t]+)/<\1>/g'$
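
For illustration, a hypothetical input line and the form the command produces from it:

\begin{verbatim}
wsdbm:User0 wsdbm:follows wsdbm:User1 .
<wsdbm:User0> <wsdbm:follows> <wsdbm:User1> .
\end{verbatim}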

\subsection{Implementation}
\label{subsec:implementation}
The algorithms were implemented in Python.
Python may not offer the fastest execution, but as an interpreted language it is well suited to fast-paced development.
The code (including this report) is published at: \url{https://naclador.de/mosers/ADBIS-Projekt2}
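
The experiment chains the four relations from section~\ref{sec:dataset-description} into one join.
Condensed from the \verb|compare| function in the accompanying code (variable names shortened here), each relation is first extracted into a (subject, object) table, and the intermediate result then grows by one column per join:

\begin{verbatim}
comp_tables[p] = [(s.n3(), o.n3())
                  for s, _, o in graph.triples((None, properties[p], None))]

join1 = hash_join(comp_tables["follow"], comp_tables["friend"])
join2 = hash_join(join1, comp_tables["like"], 2)    # join on the friend
join3 = hash_join(join2, comp_tables["review"], 3)  # join on the product
\end{verbatim}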

\subsection{Analysis}
\label{subsec:analysis}
Due to a lack of time, an existing bug in the Sort Merge Join implementation could not be found and fixed.
Therefore, the following results for the small dataset are not representative:

\begin{verbatim}
5.67576265335083s for Hash Join (11415461 items)
0.15003275871276855s for Sort Merge Join (1475 items)
6.041101694107056s for Nested Loop Join (11415461 items)
\end{verbatim}

As can be seen, the number of result items for the Sort Merge Join differs greatly from the other two joins.

For smaller tables, Hash Join is expected to be the fastest algorithm, since the hash table can be kept completely in memory.
For bigger tables, Sort Merge Join is expected to be faster, as it does not depend on a hash table fitting into memory.

\section{Conclusion}
\label{sec:conclusion}
Join algorithms should be chosen according to the use case.
There is no single fastest algorithm: Hash Join is very effective for small tables, while Sort Merge Join is ideal for pre-sorted tables.
The comparison with the Nested Loop Join shows that a sophisticated algorithm is usually better than the simplest solution.

\begin{thebibliography}{watdiv}
\bibitem{wikijoin}
Join (SQL), Wikipedia, \url{https://en.wikipedia.org/wiki/Join_(SQL)}
\bibitem{wikihash}
Hash join, Wikipedia, \url{https://en.wikipedia.org/wiki/Hash_join}
\bibitem{wikimergesort}
Sort-merge join, Wikipedia, \url{https://en.wikipedia.org/wiki/Sort-merge_join}
\bibitem{watdiv}
Waterloo SPARQL Diversity Test Suite, University of Waterloo, \url{https://dsg.uwaterloo.ca/watdiv/}
\bibitem{rdflib}
RDFLib, \url{https://rdflib.readthedocs.io}
\end{thebibliography}

\end{document}