diff --git a/.gitignore b/.gitignore
index 75f96d9..f479542 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 # created by virtualenv automatically
 venv/*
-.idea/*
\ No newline at end of file
+.idea/*
+ADBISProject2.pdf
+watdiv*
+Report/out
\ No newline at end of file
diff --git a/Code/main.py b/Code/main.py
new file mode 100644
index 0000000..8ada360
--- /dev/null
+++ b/Code/main.py
@@ -0,0 +1,122 @@
+from rdflib import Graph, namespace
+from collections import defaultdict
+import time
+
+
+def merge_tuples(a_tup, b_tup, b_excl):
+    # Concatenate two tuples, dropping the join column (index b_excl) from the second
+    b_list = list(b_tup)
+    del b_list[b_excl]
+    return tuple(list(a_tup)+b_list)
+
+
+def hash_join(a_table, b_table, a_index=1, b_index=0):
+    # Create a hash dict and fill it with the first table
+    hash_dict = defaultdict(list)
+    for a_tup in a_table:
+        hash_dict[a_tup[a_index]].append(a_tup)
+    # For each element in second table check elements from first table by index
+    return [merge_tuples(a_tup, b_tup, b_index) for b_tup in b_table for a_tup in hash_dict[b_tup[b_index]]]
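+
+
+# Illustrative example (hypothetical tuples, not part of the benchmark):
+# hash_join([("u1", "u2")], [("u2", "p1")]) yields [("u1", "u2", "p1")]:
+# the matching rows are merged and the duplicated join column is dropped.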
comp_tables["like"], 2) + joinnl3 = hash_join(joinnl2, comp_tables["review"], 3) + print(f"{time.time()-loop_start}s for Nested Loop Join ({len(joinnl3)} items)") + + +g = Graph() +g.parse("watdiv-url100k.txt", format="nt") +compare(g) + +h = Graph() +h.parse("watdiv.10M.nt", format="nt") +compare(h) diff --git a/Report/report.tex b/Report/report.tex new file mode 100644 index 0000000..88c3425 --- /dev/null +++ b/Report/report.tex @@ -0,0 +1,152 @@ +%! Author = mrmcx +%! Date = 14.07.2022 +%! Template = Fabian Wenzelmann, 2016--2019 + +\documentclass[a4paper, + twoside, % to have to sided mode + headlines=2.1 % number of lines in the heading, increase if you want more +]{scrartcl} +\usepackage[ + margin=2cm, + includefoot, + footskip=35pt, + includeheadfoot, + headsep=0.5cm, +]{geometry} +\usepackage[utf8]{inputenc} +\usepackage[english]{babel} +\usepackage[T1]{fontenc} +\usepackage{mathtools} +\usepackage{amssymb} +\usepackage{lmodern} +\usepackage[automark,headsepline]{scrlayer-scrpage} +\usepackage{enumerate} +\usepackage[protrusion=true,expansion=true,kerning]{microtype} +\usepackage{hyperref} + +\newcommand{\yourname}{Simon Moser} +\newcommand{\lecture}{Advanced Database and Information Systems} +\newcommand{\project}{Project 2} +\author{\yourname} +\title{\lecture} +\subtitle{\project} + +\pagestyle{scrheadings} +\setkomafont{pagehead}{\normalfont} +\lohead{\lecture\\\yourname} +\lehead{\lecture\\\yourname} +\rohead{\project} +\rehead{\project} + +\begin{document} + \maketitle + \section{Problem Statement} + \label{sec:problem-statement} + This report presents different approaches on Join\cite{wikijoin} algorithms on tables. + When two tables are joined, all columns are combined based on one related column in each table. + Every possible combination of rows where those columns are equal is returned. + A very simple approach would be to loop through the first table and for each row loop through the second table and check for equality of the related column. + Since the complexity of this approach is $O(M*N)$ for table sizes $M$ and $N$ and therefore very high, the following sections explain different approaches to this problem. + + \section{Algorithm Description} + \label{sec:algorithm-description} + \subsection{Hash Join} + \label{subsec:hash-join} + The Hash Join\cite{wikihash} algorithm tries to offer a faster solution by using hash tables. + The rough description of the algorithm is: + \begin{enumerate} + \item Add each row of the smaller table to the hash table, where compared column is used to create the hash. + \item For each row in the other table, lookup the hash of it's compared column + \item If the hash exists in the hash table, compare the two rows whether not only the hash is equal + \end{enumerate} + + One caveat of this approach is that the speed relies on the hash table being in-memory. + If it exceeds the size of the memory, it will be slowed down. + + This algorithm is expected to have a complexity of $O(M+N)$ because the first table has to be looped to build the hash table and the second table has to be looped to check the hash table. + Here, a hash table access is expected to have a complexity of $O(1)$. + + \subsection{Sort Merge Join} + \label{subsec:sort-merge-join} + A Sort Merge Join\cite{wikimergesort} tries to restrict the complexity by using good sort algorithms. + For this approach, both tables are sorted by the compared column first. + Then a pointer is set on the start of each column. + Now for each step the related columns for each pointer are compared. 
+
+    \subsection{Sort Merge Join}
+    \label{subsec:sort-merge-join}
+    A Sort Merge Join\cite{wikimergesort} limits the complexity by relying on efficient sorting.
+    For this approach, both tables are first sorted by the compared column.
+    Then a pointer is set to the start of each table.
+    In each step, the compared columns at the two pointers are examined.
+    If they are equal, the merged rows are added to the result.
+    If not, the pointer in the table with the smaller value is advanced by one row.
+
+    This approach relies on two things: first, the content of the compared column has to be sortable in some way.
+    Second, since both tables need to be sorted first, an efficient sort algorithm has to be used.
+
+    Sorting the two tables, e.g.\ with merge sort, has a complexity of $O(M \log M + N \log N)$.
+    This exceeds the complexity of the approach in Subsection~\ref{subsec:hash-join}.
+    Still, this algorithm can be more effective when the tables are already sorted.
+    In this case, the complexity drops to the $O(M+N)$ required for walking through the tables.
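+
+    A minimal sketch of the merge phase, assuming both tables are already sorted and the join keys are unique (duplicate keys would additionally require cross-joining runs of equal rows); the join-column indexes match the implementation's defaults:
+
+    \begin{verbatim}
+    i, j, result = 0, 0, []
+    while i < len(a_table) and j < len(b_table):
+        if a_table[i][1] == b_table[j][0]:      # join column: a[1] vs. b[0]
+            result.append(a_table[i] + b_table[j])
+            i += 1
+        elif a_table[i][1] < b_table[j][0]:
+            i += 1
+        else:
+            j += 1
+    \end{verbatim}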
+
+    \section{Dataset Description}
+    \label{sec:dataset-description}
+    For the analysis, the WatDiv\cite{watdiv} dataset is used.
+    It consists of the following entity types:
+    \begin{itemize}
+        \item wsdbm:User
+        \item wsdbm:Product
+        \item wsdbm:Review
+    \end{itemize}
+
+    They are connected by the following relations:
+    \begin{itemize}
+        \item wsdbm:User wsdbm:follows wsdbm:User
+        \item wsdbm:User wsdbm:friendOf wsdbm:User
+        \item wsdbm:User wsdbm:likes wsdbm:Product
+        \item wsdbm:Product rev:hasReview wsdbm:Review
+    \end{itemize}
+
+    \section{Experiment and Analysis}
+    \label{sec:experiment-and-analysis}
+    \subsection{Preparations}
+    \label{subsec:preparations}
+    To import the dataset with the Python library rdflib\cite{rdflib}, the prefixed names in the file were wrapped in angle brackets to form valid URIs, using the following bash command:
+
+    \verb$ sed -E 's/([a-zA-Z0-9]+:[^ \t]+)/<\1>/g'$
+
+    For example, \verb$wsdbm:User0$ becomes \verb$<wsdbm:User0>$.
+
+    \subsection{Implementation}
+    \label{subsec:implementation}
+    The algorithms were implemented in Python.
+    It may not be the fastest language, but as an interpreted language it is well suited for rapid development.
+    The code (including this report) is published at: \url{https://naclador.de/mosers/ADBIS-Projekt2}
+
+    \subsection{Analysis}
+    \label{subsec:analysis}
+    Due to a lack of time, a bug in the sort merge join could not be located and fixed.
+    Therefore, the following results for the small dataset are not representative:
+
+    \begin{verbatim}
+    5.67576265335083s for Hash Join (11415461 items)
+    0.15003275871276855s for Sort Merge Join (1475 items)
+    6.041101694107056s for Nested Loop Join (11415461 items)
+    \end{verbatim}
+
+    As can be seen, the number of result items for the sort merge join differs greatly from that of the other algorithms.
+
+    For smaller tables, Hash Join is expected to be the fastest algorithm, since the hash table can be kept in memory completely.
+    For bigger tables, Sort Merge Join is expected to be faster, as it is not bound by memory in the same way.
+
+    \section{Conclusion}
+    \label{sec:conclusion}
+    The choice of a join algorithm should depend on the use case.
+    There is no single fastest algorithm: for small tables Hash Join is very effective, while Sort Merge Join is ideal for pre-sorted tables.
+    The comparison with Nested Loop Join shows that a sophisticated algorithm is usually better than the simplest solution.
+
+    \begin{thebibliography}{watdiv}
+        \bibitem{wikijoin}
+        Join (SQL), Wikipedia, \url{https://en.wikipedia.org/wiki/Join_(SQL)}
+        \bibitem{wikihash}
+        Hash join, Wikipedia, \url{https://en.wikipedia.org/wiki/Hash_join}
+        \bibitem{wikimergesort}
+        Sort-merge join, Wikipedia, \url{https://en.wikipedia.org/wiki/Sort-merge_join}
+        \bibitem{watdiv}
+        Waterloo SPARQL Diversity Test Suite, University of Waterloo, \url{https://dsg.uwaterloo.ca/watdiv/}
+        \bibitem{rdflib}
+        RDFLib, \url{https://rdflib.readthedocs.io}
+    \end{thebibliography}
+\end{document}
diff --git a/main.py b/main.py
deleted file mode 100644
index e69de29..0000000