Report added
Parent c5938d23a2
Commit d22178dfab
@@ -1,3 +1,6 @@
# created by virtualenv automatically
venv/*
.idea/*
ADBISProject2.pdf
watdiv*
Report/out
@@ -0,0 +1,107 @@
from rdflib import Graph, namespace
from collections import defaultdict
import time


def merge_tuples(a_tup, b_tup, b_excl):
    # Concatenate two tuples, dropping the join column (index b_excl) of the second one
    b_list = list(b_tup)
    del b_list[b_excl]
    return tuple(list(a_tup) + b_list)


def hash_join(a_table, b_table, a_index=1, b_index=0):
    # Create a hash dict and fill it with the first table
    hash_dict = defaultdict(list)
    for a_tup in a_table:
        hash_dict[a_tup[a_index]].append(a_tup)
    # For each row of the second table, emit one merged row per matching row of the first table
    return [merge_tuples(a_tup, b_tup, b_index)
            for b_tup in b_table for a_tup in hash_dict[b_tup[b_index]]]


class SortMergeJoin:
    def get_key(self, a):
        return self.selected_tuples[a][self.indexes[a]]

    def active_key(self):
        return self.get_key(self.a_is_active)

    def inactive_key(self):
        return self.get_key(not self.a_is_active)

    def __init__(self, a_table, b_table, a_index=1, b_index=0):
        # First sort both tables and create iterators over the sorted rows
        self.iterators = {
            True: iter(sorted(a_table, key=lambda tup: tup[a_index])),
            False: iter(sorted(b_table, key=lambda tup: tup[b_index]))
        }
        self.indexes = {
            True: a_index,
            False: b_index
        }
        self.a_is_active = True
        self.result = list()
        try:
            self.selected_tuples = {
                True: next(self.iterators[True]),
                False: next(self.iterators[False])
            }
        except StopIteration:
            # One of the tables is empty, so the join result stays empty
            self.selected_tuples = None

    def join(self):
        if self.selected_tuples is None:
            return self.result
        while True:
            try:
                while self.active_key() <= self.inactive_key():
                    if self.active_key() == self.inactive_key():
                        self.result.append(merge_tuples(self.selected_tuples[True], self.selected_tuples[False], self.indexes[False]))
                    # Known bug (see the report's analysis section): on equal keys only
                    # one pointer advances, so rows sharing a duplicate join key are
                    # paired at most once instead of forming a full cross product.
                    self.selected_tuples[self.a_is_active] = next(self.iterators[self.a_is_active])
                self.a_is_active = not self.a_is_active
            except StopIteration:
                break
        return self.result


def nested_loop_join(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of the first table with every row of the second table
    return [merge_tuples(a_tup, b_tup, b_index)
            for a_tup in a_table for b_tup in b_table if a_tup[a_index] == b_tup[b_index]]


def compare(graph):
    print(f"{len(graph)} records loaded")
    wsdbm = namespace.Namespace("wsdbm:")
    rev = namespace.Namespace("rev:")
    properties = {
        "follow": wsdbm.follows,
        "friend": wsdbm.friendOf,
        "like": wsdbm.likes,
        "review": rev.hasReview
    }
    # Build one (subject, object) table per property
    comp_tables = dict()
    for p in properties:
        comp_tables[p] = [(s.n3(), o.n3()) for s, _, o in graph.triples((None, properties[p], None))]

    hash_start = time.time()
    joinh1 = hash_join(comp_tables["follow"], comp_tables["friend"])
    joinh2 = hash_join(joinh1, comp_tables["like"], 2)
    joinh3 = hash_join(joinh2, comp_tables["review"], 3)
    print(f"{time.time() - hash_start}s for Hash Join ({len(joinh3)} items)")

    merge_sort_start = time.time()
    joinsm1 = SortMergeJoin(comp_tables["follow"], comp_tables["friend"]).join()
    joinsm2 = SortMergeJoin(joinsm1, comp_tables["like"], 2).join()
    joinsm3 = SortMergeJoin(joinsm2, comp_tables["review"], 3).join()
    print(f"{time.time() - merge_sort_start}s for Sort Merge Join ({len(joinsm3)} items)")

    loop_start = time.time()
    joinnl1 = nested_loop_join(comp_tables["follow"], comp_tables["friend"])
    joinnl2 = nested_loop_join(joinnl1, comp_tables["like"], 2)
    joinnl3 = nested_loop_join(joinnl2, comp_tables["review"], 3)
    print(f"{time.time() - loop_start}s for Nested Loop Join ({len(joinnl3)} items)")


g = Graph()
g.parse("watdiv-url100k.txt", format="nt")
compare(g)

h = Graph()
h.parse("watdiv.10M.nt", format="nt")
compare(h)
@@ -0,0 +1,152 @@
%! Author = mrmcx
%! Date = 14.07.2022
%! Template = Fabian Wenzelmann, 2016--2019

\documentclass[a4paper,
    twoside,      % two-sided mode
    headlines=2.1 % number of lines in the heading, increase if you want more
]{scrartcl}
\usepackage[
    margin=2cm,
    includefoot,
    footskip=35pt,
    includeheadfoot,
    headsep=0.5cm,
]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[english]{babel}
\usepackage[T1]{fontenc}
\usepackage{mathtools}
\usepackage{amssymb}
\usepackage{lmodern}
\usepackage[automark,headsepline]{scrlayer-scrpage}
\usepackage{enumerate}
\usepackage[protrusion=true,expansion=true,kerning]{microtype}
\usepackage{hyperref}

\newcommand{\yourname}{Simon Moser}
\newcommand{\lecture}{Advanced Database and Information Systems}
\newcommand{\project}{Project 2}
\author{\yourname}
\title{\lecture}
\subtitle{\project}

\pagestyle{scrheadings}
\setkomafont{pagehead}{\normalfont}
\lohead{\lecture\\\yourname}
\lehead{\lecture\\\yourname}
\rohead{\project}
\rehead{\project}

\begin{document}
\maketitle

\section{Problem Statement}
\label{sec:problem-statement}
This report presents different join\cite{wikijoin} algorithms for tables.
When two tables are joined, the columns of both tables are combined based on one related column in each table.
Every possible combination of rows in which those columns are equal is returned.
A very simple approach is to loop through the first table and, for each row, loop through the second table and check the related columns for equality; a minimal sketch of this approach follows below.
Since this approach has a very high complexity of $O(M \cdot N)$ for table sizes $M$ and $N$, the following sections describe more efficient approaches to this problem.
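
A minimal Python sketch of this naive approach (a condensed version of the \verb|nested_loop_join| function in the accompanying code; tables are lists of tuples and the join columns are given by index):

\begin{verbatim}
def nested_loop_join(a_table, b_table, a_index=1, b_index=0):
    # Compare every row of the first table with every row of the second;
    # the duplicated join column is kept here for brevity.
    return [a_tup + b_tup for a_tup in a_table for b_tup in b_table
            if a_tup[a_index] == b_tup[b_index]]
\end{verbatim}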

\section{Algorithm Description}
\label{sec:algorithm-description}

\subsection{Hash Join}
\label{subsec:hash-join}
The Hash Join\cite{wikihash} algorithm offers a faster solution by using a hash table.
A rough description of the algorithm, with a short sketch following the list, is:
\begin{enumerate}
    \item Add each row of the smaller table to a hash table, using the compared column as the key.
    \item For each row of the other table, look up the hash of its compared column.
    \item If the hash exists in the hash table, compare the actual column values to make sure that not only the hash is equal.
\end{enumerate}
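
A condensed Python sketch of this procedure, abbreviated from the \verb|hash_join| function in the accompanying code:

\begin{verbatim}
from collections import defaultdict

def hash_join(a_table, b_table, a_index=1, b_index=0):
    hash_dict = defaultdict(list)  # join key -> rows of the first table
    for a_tup in a_table:
        hash_dict[a_tup[a_index]].append(a_tup)
    # Probe the hash table with each row of the second table
    return [a_tup + b_tup for b_tup in b_table
            for a_tup in hash_dict[b_tup[b_index]]]
\end{verbatim}

Note that Python dictionaries perform the equality check of step 3 internally when a key is looked up.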

One caveat of this approach is that its speed relies on the hash table fitting in memory.
If the hash table exceeds the available memory, the join is slowed down considerably.

This algorithm is expected to have a complexity of $O(M+N)$, because the first table is scanned once to build the hash table and the second table is scanned once to probe it.
Here, a single hash table access is expected to have a complexity of $O(1)$.

\subsection{Sort Merge Join}
\label{subsec:sort-merge-join}
A Sort Merge Join\cite{wikimergesort} restricts the complexity by relying on efficient sorting.
For this approach, both tables are first sorted by the compared column.
Then a pointer is set to the start of each table.
In each step, the compared columns under the two pointers are checked against each other.
If they are equal, the combined rows are added to the result.
If not, the pointer in the table with the smaller value is advanced by one row.

This approach relies on two things: first, the content of the compared column has to be sortable, i.e.\ it must have a total order.
Second, since both tables need to be sorted first, an efficient sort algorithm has to be used.
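
A minimal sketch of the merge step, assuming for simplicity that the join keys are unique within each table (duplicate keys would require pairing whole groups of equal-keyed rows; the helper name \verb|merge_step| is illustrative and not part of the accompanying code):

\begin{verbatim}
def merge_step(a_sorted, b_sorted, a_index=1, b_index=0):
    result, i, j = [], 0, 0
    while i < len(a_sorted) and j < len(b_sorted):
        a_key = a_sorted[i][a_index]
        b_key = b_sorted[j][b_index]
        if a_key == b_key:
            result.append(a_sorted[i] + b_sorted[j])
            i += 1
            j += 1
        elif a_key < b_key:
            i += 1  # advance the pointer with the smaller key
        else:
            j += 1
    return result
\end{verbatim}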

Sorting the two tables with merge sort has a complexity of $O(M \log M + N \log N)$.
This exceeds the complexity of the approach in subsection~\ref{subsec:hash-join}.
Still, this algorithm can be more efficient when the tables are already sorted.
In this case, the complexity drops to the $O(M+N)$ required for walking through both tables once.

\section{Dataset Description}
\label{sec:dataset-description}
For the analysis, the WatDiv\cite{watdiv} dataset is used.
It consists of the following entity types:
\begin{itemize}
    \item wsdbm:User
    \item wsdbm:Product
    \item wsdbm:Review
\end{itemize}

They have the following relations:
\begin{itemize}
    \item wsdbm:User wsdbm:follows wsdbm:User
    \item wsdbm:User wsdbm:friendOf wsdbm:User
    \item wsdbm:User wsdbm:likes wsdbm:Product
    \item wsdbm:Product rev:hasReview wsdbm:Review
\end{itemize}

\section{Experiment and Analysis}
\label{sec:experiment-and-analysis}

\subsection{Preparations}
\label{subsec:preparations}
To import the dataset with the Python library rdflib\cite{rdflib}, the prefixed terms in the file were wrapped in angle brackets (URI form) using the following bash command:

\verb$ sed -E 's/([a-zA-Z0-9]+:[^ \t]+)/<\1>/g'$
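
For illustration, a hypothetical input line and the form the command produces from it:

\begin{verbatim}
wsdbm:User0 wsdbm:follows wsdbm:User1 .
<wsdbm:User0> <wsdbm:follows> <wsdbm:User1> .
\end{verbatim}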

\subsection{Implementation}
\label{subsec:implementation}
The algorithms were implemented in Python.
Python may not offer the fastest execution, but as an interpreted language it is well suited to fast-paced development.
The code (including this report) is published at: \url{https://naclador.de/mosers/ADBIS-Projekt2}
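
The experiment chains the four relations from section~\ref{sec:dataset-description} into one join.
Condensed from the \verb|compare| function in the accompanying code (variable names shortened here), each relation is first extracted into a (subject, object) table, and the intermediate result then grows by one column per join:

\begin{verbatim}
comp_tables[p] = [(s.n3(), o.n3())
                  for s, _, o in graph.triples((None, properties[p], None))]

join1 = hash_join(comp_tables["follow"], comp_tables["friend"])
join2 = hash_join(join1, comp_tables["like"], 2)    # join on the friend
join3 = hash_join(join2, comp_tables["review"], 3)  # join on the product
\end{verbatim}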

\subsection{Analysis}
\label{subsec:analysis}
Due to a lack of time, an existing bug in the Sort Merge Join implementation could not be found and fixed.
Therefore, the following results for the small dataset are not representative:

\begin{verbatim}
5.67576265335083s for Hash Join (11415461 items)
0.15003275871276855s for Sort Merge Join (1475 items)
6.041101694107056s for Nested Loop Join (11415461 items)
\end{verbatim}

As can be seen, the number of result items for the Sort Merge Join differs greatly from the other two joins.

For smaller tables, Hash Join is expected to be the fastest algorithm, since the hash table can be kept completely in memory.
For bigger tables, Sort Merge Join is expected to be faster, as it does not depend on a hash table fitting into memory.

\section{Conclusion}
\label{sec:conclusion}
Join algorithms should be chosen according to the use case.
There is no single fastest algorithm: Hash Join is very effective for small tables, while Sort Merge Join is ideal for pre-sorted tables.
The comparison with the Nested Loop Join shows that a sophisticated algorithm is usually better than the simplest solution.

\begin{thebibliography}{watdiv}
\bibitem{wikijoin}
Join (SQL), Wikipedia, \url{https://en.wikipedia.org/wiki/Join_(SQL)}
\bibitem{wikihash}
Hash join, Wikipedia, \url{https://en.wikipedia.org/wiki/Hash_join}
\bibitem{wikimergesort}
Sort-merge join, Wikipedia, \url{https://en.wikipedia.org/wiki/Sort-merge_join}
\bibitem{watdiv}
Waterloo SPARQL Diversity Test Suite, University of Waterloo, \url{https://dsg.uwaterloo.ca/watdiv/}
\bibitem{rdflib}
RDFLib, \url{https://rdflib.readthedocs.io}
\end{thebibliography}

\end{document}