Report added
Parent
c5938d23a2
Commit
d22178dfab
@@ -1,3 +1,6 @@
# created by virtualenv automatically
venv/*
.idea/*
ADBISProject2.pdf
watdiv*
Report/out
@@ -0,0 +1,107 @@
from rdflib import Graph, namespace
from collections import defaultdict
import time


def merge_tuples(a_tup, b_tup, b_excl):
    # Concatenate both tuples, dropping the join column from the second one
    return a_tup + b_tup[:b_excl] + b_tup[b_excl + 1:]


def hash_join(a_table, b_table, a_index=1, b_index=0):
    # Build phase: hash the first table on its join column
    hash_dict = defaultdict(list)
    for a_tup in a_table:
        hash_dict[a_tup[a_index]].append(a_tup)
    # Probe phase: for each row of the second table, look up matching rows by key
    return [merge_tuples(a_tup, b_tup, b_index)
            for b_tup in b_table for a_tup in hash_dict[b_tup[b_index]]]


class SortMergeJoin:
    # Known limitation (see the report's analysis section): runs of duplicate
    # join keys on both sides are not cross-multiplied, so matches can be missed.

    def get_key(self, a):
        return self.selected_tuples[a][self.indexes[a]]

    def active_key(self):
        return self.get_key(self.a_is_active)

    def inactive_key(self):
        return self.get_key(not self.a_is_active)

    def __init__(self, a_table, b_table, a_index=1, b_index=0):
        # First sort both tables and create iterators over the results
        self.iterators = {
            True: iter(sorted(a_table, key=lambda tup: tup[a_index])),
            False: iter(sorted(b_table, key=lambda tup: tup[b_index]))
        }
        self.indexes = {
            True: a_index,
            False: b_index
        }
        self.a_is_active = True
        self.result = list()
        self.selected_tuples = None
        try:
            self.selected_tuples = {
                True: next(self.iterators[True]),
                False: next(self.iterators[False])
            }
        except StopIteration:
            # One of the tables is empty, so the join result stays empty
            return

    def join(self):
        if self.selected_tuples is None:
            return self.result
        while True:
            try:
                # Advance the side with the smaller key; emit a row on equality
                while self.active_key() <= self.inactive_key():
                    if self.active_key() == self.inactive_key():
                        self.result.append(merge_tuples(
                            self.selected_tuples[True],
                            self.selected_tuples[False],
                            self.indexes[False]))
                    self.selected_tuples[self.a_is_active] = next(self.iterators[self.a_is_active])
                self.a_is_active = not self.a_is_active
            except StopIteration:
                break
        return self.result


def nested_loop_join(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of the first table with every row of the second table
    return [merge_tuples(a_tup, b_tup, b_index)
            for a_tup in a_table for b_tup in b_table
            if a_tup[a_index] == b_tup[b_index]]


def compare(graph):
    print(f"{len(graph)} records loaded")
    wsdbm = namespace.Namespace("wsdbm:")
    rev = namespace.Namespace("rev:")
    properties = {
        "follow": wsdbm.follows,
        "friend": wsdbm.friendOf,
        "like": wsdbm.likes,
        "review": rev.hasReview
    }
    # Extract one (subject, object) table per property from the graph
    comp_tables = dict()
    for p in properties:
        comp_tables[p] = [(s.n3(), o.n3()) for s, _, o in graph.triples((None, properties[p], None))]

    hash_start = time.time()
    joinh1 = hash_join(comp_tables["follow"], comp_tables["friend"])
    joinh2 = hash_join(joinh1, comp_tables["like"], 2)
    joinh3 = hash_join(joinh2, comp_tables["review"], 3)
    print(f"{time.time() - hash_start}s for Hash Join ({len(joinh3)} items)")

    merge_sort_start = time.time()
    joinsm1 = SortMergeJoin(comp_tables["follow"], comp_tables["friend"]).join()
    joinsm2 = SortMergeJoin(joinsm1, comp_tables["like"], 2).join()
    joinsm3 = SortMergeJoin(joinsm2, comp_tables["review"], 3).join()
    print(f"{time.time() - merge_sort_start}s for Sort Merge Join ({len(joinsm3)} items)")

    loop_start = time.time()
    joinnl1 = nested_loop_join(comp_tables["follow"], comp_tables["friend"])
    joinnl2 = nested_loop_join(joinnl1, comp_tables["like"], 2)
    joinnl3 = nested_loop_join(joinnl2, comp_tables["review"], 3)
    print(f"{time.time() - loop_start}s for Nested Loop Join ({len(joinnl3)} items)")


g = Graph()
g.parse("watdiv-url100k.txt", format="nt")
compare(g)

h = Graph()
h.parse("watdiv.10M.nt", format="nt")
compare(h)
@@ -0,0 +1,152 @@
%! Author = mrmcx
%! Date = 14.07.2022
%! Template = Fabian Wenzelmann, 2016--2019

\documentclass[a4paper,
    twoside, % two-sided mode
    headlines=2.1 % number of lines in the heading, increase if you want more
]{scrartcl}
\usepackage[
    margin=2cm,
    includefoot,
    footskip=35pt,
    includeheadfoot,
    headsep=0.5cm,
]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{lmodern}
\usepackage[automark,headsepline]{scrlayer-scrpage}
\usepackage{enumerate}
\usepackage[protrusion=true,expansion=true,kerning]{microtype}
\usepackage{hyperref}

\newcommand{\yourname}{Simon Moser}
\newcommand{\lecture}{Advanced Database and Information Systems}
\newcommand{\project}{Project 2}
\author{\yourname}
\title{\lecture}
\subtitle{\project}

\pagestyle{scrheadings}
\setkomafont{pagehead}{\normalfont}
\lohead{\lecture\\\yourname}
\lehead{\lecture\\\yourname}
\rohead{\project}
\rehead{\project}

\begin{document}
\maketitle

\section{Problem Statement}
\label{sec:problem-statement}
This report presents different join\cite{wikijoin} algorithms for tables.
When two tables are joined, their columns are combined based on one related column in each table.
Every possible combination of rows in which those columns are equal is returned.
A very simple approach would be to loop through the first table and, for each row, loop through the second table and check the related column for equality, as sketched below.
The complexity of this approach is $O(M \cdot N)$ for table sizes $M$ and $N$: for two tables of $10^4$ rows each, that is already $10^8$ comparisons.
The following sections therefore explain different approaches to this problem.
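
A minimal sketch of this naive approach (the name \texttt{nested\_loop\_join\_sketch} is ours; the implementation's \texttt{nested\_loop\_join} additionally drops the duplicated join column):
\begin{verbatim}
def nested_loop_join_sketch(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of a_table with every row of b_table: O(M*N)
    result = []
    for a_row in a_table:
        for b_row in b_table:
            if a_row[a_index] == b_row[b_index]:
                result.append(a_row + b_row)
    return result
\end{verbatim}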

\section{Algorithm Description}
\label{sec:algorithm-description}

\subsection{Hash Join}
\label{subsec:hash-join}
The Hash Join\cite{wikihash} algorithm tries to offer a faster solution by using hash tables.
A rough description of the algorithm is:
\begin{enumerate}
    \item Add each row of the smaller table to a hash table, where the compared column is used to compute the hash.
    \item For each row in the other table, look up the hash of its compared column.
    \item If the hash exists in the hash table, compare the two rows to make sure that not only the hashes but also the values are equal.
\end{enumerate}

One caveat of this approach is that its speed relies on the hash table fitting in memory.
If the table exceeds the available memory, the join is slowed down considerably.

This algorithm is expected to have a complexity of $O(M+N)$, because the first table is traversed once to build the hash table and the second table is traversed once to probe it; the toy example below illustrates the procedure.
Here, a hash table access is expected to have a complexity of $O(1)$.
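
As a toy example, using the \texttt{hash\_join} function from the implementation with its defaults (join on the second column of the first table and the first column of the second table):
\begin{verbatim}
>>> follows = [("u1", "u2"), ("u3", "u4")]
>>> likes = [("u2", "p1"), ("u2", "p2")]
>>> hash_join(follows, likes)
[('u1', 'u2', 'p1'), ('u1', 'u2', 'p2')]
\end{verbatim}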

\subsection{Sort Merge Join}
\label{subsec:sort-merge-join}
A Sort Merge Join\cite{wikimergesort} tries to restrict the complexity by using efficient sort algorithms.
For this approach, both tables are first sorted by the compared column.
Then a pointer is set to the start of each table.
In each step, the compared columns at the two pointers are compared.
If they are equal, the combined rows are added to the result.
If not, the pointer in the table with the smaller value is advanced by one row, as traced below.
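
For illustration, a short trace on two toy key sequences (only the join keys are shown):
\begin{verbatim}
A: [1, 2, 4]   B: [2, 3, 4]
step 1: 1 < 2  -> advance A
step 2: 2 = 2  -> emit match, advance
step 3: 4 > 3  -> advance B
step 4: 4 = 4  -> emit match, done
\end{verbatim}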

This approach relies on two things: first, the content of the compared column has to be sortable in some way.
Second, since both tables need to be sorted first, an efficient sort algorithm has to be used.

Sorting the two tables with merge sort has a complexity of $O(M \log M + N \log N)$.
This exceeds the complexity of the approach in Subsection~\ref{subsec:hash-join}.
Still, this algorithm can be more efficient when the tables are already sorted.
In this case, the complexity drops to the $O(M+N)$ required for walking through both tables.
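
For illustration, with $M = N = 10^5$ the sorting step dominates:
\[
    M \log_2 M + N \log_2 N \approx 3.3 \cdot 10^6
    \quad \text{versus} \quad
    M + N = 2 \cdot 10^5 .
\]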

\section{Dataset Description}
\label{sec:dataset-description}
For the analysis, the WatDiv\cite{watdiv} dataset is used.
It consists of the following entity types:
\begin{itemize}
    \item wsdbm:User
    \item wsdbm:Product
    \item wsdbm:Review
\end{itemize}

They have the following relations, which the experiment joins into one chain (see below):
\begin{itemize}
    \item wsdbm:User wsdbm:follows wsdbm:User
    \item wsdbm:User wsdbm:friendOf wsdbm:User
    \item wsdbm:User wsdbm:likes wsdbm:Product
    \item wsdbm:Product rev:hasReview wsdbm:Review
\end{itemize}
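
The join chain evaluated by the \texttt{compare} function in the implementation follows these relations end to end, in a SPARQL-like pattern notation:
\begin{verbatim}
(?a follows ?b) JOIN (?b friendOf ?c)
                JOIN (?c likes ?p)
                JOIN (?p hasReview ?r)
\end{verbatim}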

\section{Experiment and Analysis}
\label{sec:experiment-and-analysis}
\subsection{Preparations}
\label{subsec:preparations}
To be able to import the dataset with the Python library rdflib\cite{rdflib}, the file contents were converted to a URL format using the following bash command:

\verb$ sed -E 's/([a-zA-Z0-9]+:[^ \t]+)/<\1>/g'$
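
For example, a triple line of the first form below is rewritten to the bracketed second form (the concrete identifiers are only illustrative):
\begin{verbatim}
wsdbm:User0 wsdbm:follows wsdbm:User1 .
<wsdbm:User0> <wsdbm:follows> <wsdbm:User1> .
\end{verbatim}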

\subsection{Implementation}
\label{subsec:implementation}
The algorithms were implemented in Python.
It may not offer the fastest execution, but as an interpreted language Python is very suitable for fast-paced development.
The code (including that of this report) is published at: \url{https://naclador.de/mosers/ADBIS-Projekt2}

\subsection{Analysis}
\label{subsec:analysis}
Due to a lack of time, an existing bug in the sort merge join could not be located and fixed.
A plausible cause is that runs of duplicate join keys on both sides are not cross-multiplied, so some matches are missed.
Therefore, the following results for the small dataset are not representative:

\begin{verbatim}
5.67576265335083s for Hash Join (11415461 items)
0.15003275871276855s for Sort Merge Join (1475 items)
6.041101694107056s for Nested Loop Join (11415461 items)
\end{verbatim}

As can be seen, the number of result items for the sort merge join differs greatly from the other two.

For smaller tables, the Hash Join is expected to be the fastest algorithm, since the hash table can be kept in memory completely.
For bigger tables, the Sort Merge Join should be faster, as it is not limited by the size of the available memory in the same way.

\section{Conclusion}
\label{sec:conclusion}
Join algorithms should be chosen according to the use case.
There is no single fastest algorithm: for small tables the Hash Join is very effective, while the Sort Merge Join is ideal for pre-sorted tables.
The comparison with the Nested Loop Join shows that a sophisticated algorithm is usually better than the simplest solution.

\begin{thebibliography}{watdiv}
    \bibitem{wikijoin}
    Join (SQL), Wikipedia, \url{https://en.wikipedia.org/wiki/Join_(SQL)}
    \bibitem{wikihash}
    Hash join, Wikipedia, \url{https://en.wikipedia.org/wiki/Hash_join}
    \bibitem{wikimergesort}
    Sort-merge join, Wikipedia, \url{https://en.wikipedia.org/wiki/Sort-merge_join}
    \bibitem{watdiv}
    Waterloo SPARQL Diversity Test Suite, University of Waterloo, \url{https://dsg.uwaterloo.ca/watdiv/}
    \bibitem{rdflib}
    RDFLib, \url{https://rdflib.readthedocs.io}
\end{thebibliography}

\end{document}