Report added
Parent
c5938d23a2
Commit
d22178dfab
@@ -1,3 +1,6 @@
# created by virtualenv automatically
venv/*
.idea/*
ADBISProject2.pdf
watdiv*
Report/out
@@ -0,0 +1,107 @@
from rdflib import Graph, namespace
from collections import defaultdict
import time


def merge_tuples(a_tup, b_tup, b_excl):
    # Concatenate both tuples, dropping the join column from the second one
    return a_tup + b_tup[:b_excl] + b_tup[b_excl + 1:]


def hash_join(a_table, b_table, a_index=1, b_index=0):
    # Build phase: hash the first table on its join column
    hash_dict = defaultdict(list)
    for a_tup in a_table:
        hash_dict[a_tup[a_index]].append(a_tup)
    # Probe phase: for each row of the second table, look up matching rows by key
    return [merge_tuples(a_tup, b_tup, b_index)
            for b_tup in b_table for a_tup in hash_dict[b_tup[b_index]]]


class SortMergeJoin:
    # Known limitation (see the report's analysis section): runs of duplicate
    # join keys on both sides are not cross-multiplied, so matches can be missed.

    def get_key(self, a):
        return self.selected_tuples[a][self.indexes[a]]

    def active_key(self):
        return self.get_key(self.a_is_active)

    def inactive_key(self):
        return self.get_key(not self.a_is_active)

    def __init__(self, a_table, b_table, a_index=1, b_index=0):
        # First sort both tables and create iterators over the results
        self.iterators = {
            True: iter(sorted(a_table, key=lambda tup: tup[a_index])),
            False: iter(sorted(b_table, key=lambda tup: tup[b_index]))
        }
        self.indexes = {
            True: a_index,
            False: b_index
        }
        self.a_is_active = True
        self.result = list()
        self.selected_tuples = None
        try:
            self.selected_tuples = {
                True: next(self.iterators[True]),
                False: next(self.iterators[False])
            }
        except StopIteration:
            # One of the tables is empty, so the join result stays empty
            return

    def join(self):
        if self.selected_tuples is None:
            return self.result
        while True:
            try:
                # Advance the side with the smaller key; emit a row on equality
                while self.active_key() <= self.inactive_key():
                    if self.active_key() == self.inactive_key():
                        self.result.append(merge_tuples(
                            self.selected_tuples[True],
                            self.selected_tuples[False],
                            self.indexes[False]))
                    self.selected_tuples[self.a_is_active] = next(self.iterators[self.a_is_active])
                self.a_is_active = not self.a_is_active
            except StopIteration:
                break
        return self.result


def nested_loop_join(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of the first table with every row of the second table
    return [merge_tuples(a_tup, b_tup, b_index)
            for a_tup in a_table for b_tup in b_table
            if a_tup[a_index] == b_tup[b_index]]


def compare(graph):
    print(f"{len(graph)} records loaded")
    wsdbm = namespace.Namespace("wsdbm:")
    rev = namespace.Namespace("rev:")
    properties = {
        "follow": wsdbm.follows,
        "friend": wsdbm.friendOf,
        "like": wsdbm.likes,
        "review": rev.hasReview
    }
    # Extract one (subject, object) table per property from the graph
    comp_tables = dict()
    for p in properties:
        comp_tables[p] = [(s.n3(), o.n3()) for s, _, o in graph.triples((None, properties[p], None))]

    hash_start = time.time()
    joinh1 = hash_join(comp_tables["follow"], comp_tables["friend"])
    joinh2 = hash_join(joinh1, comp_tables["like"], 2)
    joinh3 = hash_join(joinh2, comp_tables["review"], 3)
    print(f"{time.time() - hash_start}s for Hash Join ({len(joinh3)} items)")

    merge_sort_start = time.time()
    joinsm1 = SortMergeJoin(comp_tables["follow"], comp_tables["friend"]).join()
    joinsm2 = SortMergeJoin(joinsm1, comp_tables["like"], 2).join()
    joinsm3 = SortMergeJoin(joinsm2, comp_tables["review"], 3).join()
    print(f"{time.time() - merge_sort_start}s for Sort Merge Join ({len(joinsm3)} items)")

    loop_start = time.time()
    joinnl1 = nested_loop_join(comp_tables["follow"], comp_tables["friend"])
    joinnl2 = nested_loop_join(joinnl1, comp_tables["like"], 2)
    joinnl3 = nested_loop_join(joinnl2, comp_tables["review"], 3)
    print(f"{time.time() - loop_start}s for Nested Loop Join ({len(joinnl3)} items)")


g = Graph()
g.parse("watdiv-url100k.txt", format="nt")
compare(g)

h = Graph()
h.parse("watdiv.10M.nt", format="nt")
compare(h)
@@ -0,0 +1,152 @@
%! Author = mrmcx
%! Date = 14.07.2022
%! Template = Fabian Wenzelmann, 2016--2019

\documentclass[a4paper,
    twoside, % two-sided mode
    headlines=2.1 % number of lines in the heading, increase if you want more
]{scrartcl}
\usepackage[
    margin=2cm,
    includefoot,
    footskip=35pt,
    includeheadfoot,
    headsep=0.5cm,
]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{lmodern}
\usepackage[automark,headsepline]{scrlayer-scrpage}
\usepackage{enumerate}
\usepackage[protrusion=true,expansion=true,kerning]{microtype}
\usepackage{hyperref}

\newcommand{\yourname}{Simon Moser}
\newcommand{\lecture}{Advanced Database and Information Systems}
\newcommand{\project}{Project 2}
\author{\yourname}
\title{\lecture}
\subtitle{\project}

\pagestyle{scrheadings}
\setkomafont{pagehead}{\normalfont}
\lohead{\lecture\\\yourname}
\lehead{\lecture\\\yourname}
\rohead{\project}
\rehead{\project}

\begin{document}
\maketitle

\section{Problem Statement}
\label{sec:problem-statement}
This report presents different join\cite{wikijoin} algorithms for tables.
When two tables are joined, their columns are combined based on one related column in each table.
Every possible combination of rows in which those columns are equal is returned.
A very simple approach would be to loop through the first table and, for each row, loop through the second table and check the related column for equality, as sketched below.
The complexity of this approach is $O(M \cdot N)$ for table sizes $M$ and $N$: for two tables of $10^4$ rows each, that is already $10^8$ comparisons.
The following sections therefore explain different approaches to this problem.
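
A minimal sketch of this naive approach (the name \texttt{nested\_loop\_join\_sketch} is ours; the implementation's \texttt{nested\_loop\_join} additionally drops the duplicated join column):
\begin{verbatim}
def nested_loop_join_sketch(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of a_table with every row of b_table: O(M*N)
    result = []
    for a_row in a_table:
        for b_row in b_table:
            if a_row[a_index] == b_row[b_index]:
                result.append(a_row + b_row)
    return result
\end{verbatim}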

\section{Algorithm Description}
\label{sec:algorithm-description}

\subsection{Hash Join}
\label{subsec:hash-join}
The Hash Join\cite{wikihash} algorithm tries to offer a faster solution by using hash tables.
A rough description of the algorithm is:
\begin{enumerate}
    \item Add each row of the smaller table to a hash table, where the compared column is used to compute the hash.
    \item For each row in the other table, look up the hash of its compared column.
    \item If the hash exists in the hash table, compare the two rows to make sure that not only the hashes but also the values are equal.
\end{enumerate}

One caveat of this approach is that its speed relies on the hash table fitting in memory.
If the table exceeds the available memory, the join is slowed down considerably.

This algorithm is expected to have a complexity of $O(M+N)$, because the first table is traversed once to build the hash table and the second table is traversed once to probe it; the toy example below illustrates the procedure.
Here, a hash table access is expected to have a complexity of $O(1)$.
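
As a toy example, using the \texttt{hash\_join} function from the implementation with its defaults (join on the second column of the first table and the first column of the second table):
\begin{verbatim}
>>> follows = [("u1", "u2"), ("u3", "u4")]
>>> likes = [("u2", "p1"), ("u2", "p2")]
>>> hash_join(follows, likes)
[('u1', 'u2', 'p1'), ('u1', 'u2', 'p2')]
\end{verbatim}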

\subsection{Sort Merge Join}
\label{subsec:sort-merge-join}
A Sort Merge Join\cite{wikimergesort} tries to restrict the complexity by using efficient sort algorithms.
For this approach, both tables are first sorted by the compared column.
Then a pointer is set to the start of each table.
In each step, the compared columns at the two pointers are compared.
If they are equal, the combined rows are added to the result.
If not, the pointer in the table with the smaller value is advanced by one row, as traced below.
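
For illustration, a short trace on two toy key sequences (only the join keys are shown):
\begin{verbatim}
A: [1, 2, 4]   B: [2, 3, 4]
step 1: 1 < 2  -> advance A
step 2: 2 = 2  -> emit match, advance
step 3: 4 > 3  -> advance B
step 4: 4 = 4  -> emit match, done
\end{verbatim}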

This approach relies on two things: first, the content of the compared column has to be sortable in some way.
Second, since both tables need to be sorted first, an efficient sort algorithm has to be used.

Sorting the two tables with merge sort has a complexity of $O(M \log M + N \log N)$.
This exceeds the complexity of the approach in Subsection~\ref{subsec:hash-join}.
Still, this algorithm can be more efficient when the tables are already sorted.
In this case, the complexity drops to the $O(M+N)$ required for walking through both tables.
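
For illustration, with $M = N = 10^5$ the sorting step dominates:
\[
    M \log_2 M + N \log_2 N \approx 3.3 \cdot 10^6
    \quad \text{versus} \quad
    M + N = 2 \cdot 10^5 .
\]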

\section{Dataset Description}
\label{sec:dataset-description}
For the analysis, the WatDiv\cite{watdiv} dataset is used.
It consists of the following entity types:
\begin{itemize}
    \item wsdbm:User
    \item wsdbm:Product
    \item wsdbm:Review
\end{itemize}

They have the following relations, which the experiment joins into one chain (see below):
\begin{itemize}
    \item wsdbm:User wsdbm:follows wsdbm:User
    \item wsdbm:User wsdbm:friendOf wsdbm:User
    \item wsdbm:User wsdbm:likes wsdbm:Product
    \item wsdbm:Product rev:hasReview wsdbm:Review
\end{itemize}
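
The join chain evaluated by the \texttt{compare} function in the implementation follows these relations end to end, in a SPARQL-like pattern notation:
\begin{verbatim}
(?a follows ?b) JOIN (?b friendOf ?c)
                JOIN (?c likes ?p)
                JOIN (?p hasReview ?r)
\end{verbatim}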

\section{Experiment and Analysis}
\label{sec:experiment-and-analysis}
\subsection{Preparations}
\label{subsec:preparations}
To be able to import the dataset with the Python library rdflib\cite{rdflib}, the file contents were converted to a URL format using the following bash command:

\verb$ sed -E 's/([a-zA-Z0-9]+:[^ \t]+)/<\1>/g'$
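
For example, a triple line of the first form below is rewritten to the bracketed second form (the concrete identifiers are only illustrative):
\begin{verbatim}
wsdbm:User0 wsdbm:follows wsdbm:User1 .
<wsdbm:User0> <wsdbm:follows> <wsdbm:User1> .
\end{verbatim}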

\subsection{Implementation}
\label{subsec:implementation}
The algorithms were implemented in Python.
It may not offer the fastest execution, but as an interpreted language Python is very suitable for fast-paced development.
The code (including that of this report) is published at: \url{https://naclador.de/mosers/ADBIS-Projekt2}

\subsection{Analysis}
\label{subsec:analysis}
Due to a lack of time, an existing bug in the sort merge join could not be located and fixed.
A plausible cause is that runs of duplicate join keys on both sides are not cross-multiplied, so some matches are missed.
Therefore, the following results for the small dataset are not representative:

\begin{verbatim}
5.67576265335083s for Hash Join (11415461 items)
0.15003275871276855s for Sort Merge Join (1475 items)
6.041101694107056s for Nested Loop Join (11415461 items)
\end{verbatim}

As can be seen, the number of result items for the sort merge join differs greatly from the other two.

For smaller tables, the Hash Join is expected to be the fastest algorithm, since the hash table can be kept in memory completely.
For bigger tables, the Sort Merge Join should be faster, as it is not limited by the size of the available memory in the same way.

\section{Conclusion}
\label{sec:conclusion}
Join algorithms should be chosen according to the use case.
There is no single fastest algorithm: for small tables the Hash Join is very effective, while the Sort Merge Join is ideal for pre-sorted tables.
The comparison with the Nested Loop Join shows that a sophisticated algorithm is usually better than the simplest solution.

\begin{thebibliography}{watdiv}
    \bibitem{wikijoin}
    Join (SQL), Wikipedia, \url{https://en.wikipedia.org/wiki/Join_(SQL)}
    \bibitem{wikihash}
    Hash join, Wikipedia, \url{https://en.wikipedia.org/wiki/Hash_join}
    \bibitem{wikimergesort}
    Sort-merge join, Wikipedia, \url{https://en.wikipedia.org/wiki/Sort-merge_join}
    \bibitem{watdiv}
    Waterloo SPARQL Diversity Test Suite, University of Waterloo, \url{https://dsg.uwaterloo.ca/watdiv/}
    \bibitem{rdflib}
    RDFLib, \url{https://rdflib.readthedocs.io}
\end{thebibliography}

\end{document}