commit 0ccbadc92fe71f70311881acf82e861e7af36ff8 Author: Simon Moser Date: Mon Dec 27 01:25:19 2021 +0100 Uploading thesis script diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..985488e --- /dev/null +++ b/LICENSE @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/config.ini b/config.ini new file mode 100644 index 0000000..f0bad20 --- /dev/null +++ b/config.ini @@ -0,0 +1,6 @@ +[Config] +target=http://104.196.182.131 +port=9090 +max_parts=1 +block_size=1 +quota=1024 diff --git a/proxy.py b/proxy.py new file mode 100644 index 0000000..1062426 --- /dev/null +++ b/proxy.py @@ -0,0 +1,324 @@ +from http import server, HTTPStatus +from hashlib import sha256 +from configparser import ConfigParser +from datetime import datetime +from re import search +from urllib import request +from urllib.error import HTTPError +from io import StringIO +from contextlib import contextmanager +from heapq import heappush, heappop +from itertools import count +from time import sleep +import threading +import portalocker +import os +import sys + +class CacheHandler(server.BaseHTTPRequestHandler): + """ This class implements caching of Range-Requests. + + It subclasses the generic BaseHTTPRequestHandler and + implements the HTTP GET, POST and HEAD methods. + Other methods can be added easily if needed. + + """ + + def do_GET(self): + """ This method is called when handling a GET request. + + It filters range requests, forwards non-range requests and + implements the caching by writing cached data to a sparse file + and saving the information about which block is cached in another + file. + + """ + rrange = self.headers.get("Range") + if rrange is None or "," in rrange: + try: + headers = dict(self.headers.items()) + req = request.Request(target + self.path, None, headers) + answer = request.urlopen(req) + self.send_response(answer.getcode()) + for tup in answer.items(): + self.send_header(tup[0], tup[1]) + self.end_headers() + self.wfile.write(answer.read()) + except HTTPError as e: + self.send_error(e.code, e.msg) + regex = search('([A-Za-z]+)=(\d+)\s*-\s*(\d+)', rrange) + start = int(regex.group(2)) + start_sector = start//block_size + end = int(regex.group(3)) + length = end-start+1 + length_sector = length//block_size + m = sha256() + m.update(self.path.encode('utf-8')) + filename = "cache_" + m.hexdigest() + data_filename = filename + ".data" + missing_parts = 0 + threadLock.acquire() + try: + with self.filelock(filename, 'r') as f: + f.seek(start_sector) + cached_sectors = f.read(length_sector) + last_exists = True + cur = sector = missing_start = 0 + for cur, sector in enumerate(cached_sectors, start_sector): + if sector is "1" and not last_exists: + last_exists = True + if missing_parts is 0: + missing_parts = (missing_start*block_size, (cur-1)*block_size) + else: + missing_parts = (start, start+length-1) + break + elif not sector is "1" and last_exists: + last_exists = False + missing_start = cur + if not last_exists: + if missing_parts is 0: + missing_parts = (missing_start*block_size, cur*block_size) + else: + missing_parts = (start, start+length-1) + except OSError: + try: + req = request.Request(target + self.path, None, {}, None, False, 'HEAD') + size = int(request.urlopen(req).getheader("Content-Length")) + except HTTPError as e: + self.send_error(e.code, e.msg) + return + with self.filelock(filename, 'w') as f: + f.truncate(size//block_size) + with self.filelock(data_filename, 'wb') as df: + df.truncate(size) + missing_parts = (start, start+length-1) + part_start, part_end = missing_parts + try: + part_length = part_end - part_start + 1 + rheaders = dict(self.headers.items()) + rheaders["Range"] = "bytes=" + str(part_start) + "-" + str(part_end) + req = request.Request(target + self.path, None, rheaders) + answer = request.urlopen(req) + data = answer.read() + if answer.getcode() == HTTPStatus.PARTIAL_CONTENT and len(data) == part_length: + with self.filelock(data_filename, 'rb+') as f: + f.seek(part_start) + f.write(data) + with self.filelock(filename, 'r+') as f: + f.seek(part_start//block_size) + for i in range(part_length//block_size): + f.write("1") + self.check_cache_size() + else: + raise HTTPError( + req.get_full_url(), + HTTPStatus.BAD_GATEWAY, + "", + rheaders, + StringIO("") + ) + except HTTPError as e: + self.send_error(e.code, e.msg) + return + with self.filelock(data_filename, 'rb') as f: + f.seek(start) + data = f.read(length) + threadLock.release() + size = os.stat(data_filename).st_size + self.send_response(HTTPStatus.PARTIAL_CONTENT) + self.send_header("Content-Length", length) + self.send_header("Content-Range", "bytes "+str(start)+"-"+str(end)+"/"+str(size)) + self.end_headers() + self.wfile.write(data) + + + @contextmanager + def filelock(self, fp, mode = "r+"): + """ This method opens a file and locks it according to the mode. + + The portalocker module is used to lock the requested file. + If a read only mode is chosen, a shared lock is used, + otherwise the lock is exclusive. The contextmanager closes + the file after leaving the "with" block and the lock is released + after closing automatically. + + Args: + fp (str): The file path to open. + mode (str, optional): The mode used by open. Defaults to 'r+' + + """ + file = open(fp, mode) + if mode is 'r' or mode is 'rb': + flag = portalocker.LOCK_SH + else: + flag = portalocker.LOCK_EX + portalocker.lock(file, flag) + try: + yield file + finally: + file.close() + + + def do_POST(self): + """ This method is called when handling a POST request. + + It forward the headers and data of the original request to the + image archive and returns the answer to the client. + + """ + try: + headers = dict(self.headers.items()) + req = request.Request(target + self.path, self.rfile.read(), headers, None, False, 'POST') + answer = request.urlopen(req) + self.send_response(answer.getcode()) + for tup in answer.getheaders(): + self.send_header(tup[0], tup[1]) + self.end_headers() + self.wfile.write(answer.read()) + except HTTPError as e: + self.send_error(e.code, e.msg) + + + def do_HEAD(self): + """ This method is called when handling a HEAD request. + + It forwards the headers of the original request to the + image archive and returns the answer to the client. + + """ + try: + headers = dict(self.headers.items()) + req = request.Request(target + self.path, None, headers, None, False, 'HEAD') + answer = request.urlopen(req) + self.send_response(answer.getcode()) + for tup in answer.getheaders(): + self.send_header(tup[0], tup[1]) + self.end_headers() + self.wfile.write(answer.read()) + except HTTPError as e: + self.send_error(e.code, e.msg) + + +class QuotaCheck (threading.Thread): + """ This class checks for exceeding the allowed disk space and cleans update. + + It subclasses the threading.Thread to run independently + from the proxy thread. + + """ + + def __init__(self, quota): + """ This method is called to initiate the class. + + It calls the super classes __init__ and sets the selected quota. + + Args: + quota (int): The chosen disk usage limit. + + """ + threading.Thread.__init__(self) + self.quota = quota + + + def run(self): + """This method is called by the super class to run the thread. + + It is called by the super classes start method. + It checks whether the space is exceeded. If this is the case, + it locks the proxys thread to avoid problems and cleans the cache. + + """ + while True: + sleep(1) + if self.cache_full(): + threadLock.acquire() + self.clean_cache() + threadLock.release() + + + def cache_full(self): + """ This method checks whether the disk space is exceeded. """ + total_size = 0 + for f in os.scandir(): + if f.name.startswith("cache_") and f.is_file(): + total_size += f.stat().st_blocks/2048 + return total_size > self.quota + + + def clean_cache(self): + """This methods reduces the disk usage to fulfill the quota + + It creates a priority queue of cache files ordered by the time + of the last access and deletes the files whose last access + was longest ago until the quota is fulfilled. + + """ + total_size = 0 + h = [] + c = count() + for f in os.scandir(): + if f.name.startswith("cache_") and f.is_file(): + total_size += f.stat().st_blocks/2048 + heappush(h, (f.stat().st_atime_ns, next(c), f)) + while total_size > self.quota: + f = heappop(h)[2] + os.remove(f.path) + total_size -= f.stat().st_blocks/2048 + + +class ServerThread (threading.Thread): + """ This class runs the proxy as a separate thread. + + It subclasses the threading.Thread to run independently + from the quota thread. + + """ + + def __init__(self): + """ This method is called to initiate the class. + + It calls the super classes __init__. + + """ + threading.Thread.__init__(self) + self.httpd = server.HTTPServer(server_address, CacheHandler) + + + def run(self): + """This method is called by the super class to run the thread. + + It is called by the super classes start method. + It runs the proxys method to check for incoming requests forever. + + """ + self.httpd.serve_forever() + + +if __name__ == '__main__': + """ This code starts the proxy and the quota threads + + It is executed when the file is executed. + It reads the config file and sets the globals, creates the threads + and waits for the proxy thread to finish (which will not happen). + It catches a KeyboardInterrupt and ends the program if recieved. + + """ + config = ConfigParser() + config.read('config.ini') + server_address = ('', int(config['Config']['port'])) + global block_size + block_size = int(config['Config']['block_size']) + global target + target = config['Config']['target'] + global threadLock + threadLock = threading.Lock() + t1 = QuotaCheck(int(config['Config']['quota'])) + t2 = ServerThread() + t1.start() + t2.start() + try: + while t2.isAlive(): + t2.join(1) + except KeyboardInterrupt: + print("Interrupt recieved, stopping...") + sys.exit() diff --git a/thesis.pdf b/thesis.pdf new file mode 100644 index 0000000..700a368 Binary files /dev/null and b/thesis.pdf differ