From 3dbcbfd358cef891c45bada7a46e9918847d5d34 Mon Sep 17 00:00:00 2001 From: Matthew Leick-Macari Date: Wed, 23 Jan 2019 22:35:38 -0600 Subject: [PATCH] updated presentation --- .../Finding Duplicates Using Hashing.ipynb | 257 +++++++++++++++++- 1 file changed, 245 insertions(+), 12 deletions(-) diff --git a/tutorials/201901_finding_duplicate_files/Finding Duplicates Using Hashing.ipynb b/tutorials/201901_finding_duplicate_files/Finding Duplicates Using Hashing.ipynb index 707329f..1caad8c 100644 --- a/tutorials/201901_finding_duplicate_files/Finding Duplicates Using Hashing.ipynb +++ b/tutorials/201901_finding_duplicate_files/Finding Duplicates Using Hashing.ipynb @@ -6,7 +6,7 @@ "source": [ "# How to Detect Duplicates Using Hashing\n", "\n", - "#TODO: Write me.." + "This is a high-level introduction to hashing, followed by how to detect duplicate objects using hashes." ] }, { @@ -16,11 +16,15 @@ "outputs": [], "source": [ "from binascii import hexlify\n", + "from base64 import urlsafe_b64encode\n", "from datetime import datetime\n", "import hashlib\n", "from hmac import compare_digest\n", "import json\n", "import os\n", + "import random\n", + "import string\n", + "import uuid\n", "\n", "import requests as r" ] @@ -33,14 +37,18 @@ "\n", "### What is a hash?\n", "\n", - "A hash function is a deterministic mathematical function that takes an input of any length and content (e.g. letters, numbers, and symbols) and uses a formula to produce an output of a specific length. \n", + "A hash function is a deterministic mathematical function that takes an input of any length and content (e.g. letters, numbers, and symbols) and uses a formula to produce an output of a specific length.\n", + "\n", + "A hash function differs from an encryption algorithm in that hash functions are not reversible whereas encryption algorithms are reversible. 
\n", "\n", "### What can be hashed?\n", "\n", "A hash can be created using nearly any form of digitial content: a document, image, song, etc.\n", "\n", "\n", - "reference: [What is hashing?](https://medium.com/tech-tales/what-is-hashing-6edba0ebfa67)" + "references: \n", + "* [What is hashing?](https://medium.com/tech-tales/what-is-hashing-6edba0ebfa67)\n", + "* [About Secure Password Hashing](https://security.blogoverflow.com/2013/09/about-secure-password-hashing/)" ] }, { @@ -115,6 +123,46 @@ "print('Image hash: ' + picture_hash.hexdigest())" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hashing a large amount of data\n", + "# This is not cryptographically secure\n", + "\n", + "def string_generator(size=6, chars=string.ascii_uppercase + string.digits):\n", + " return ''.join(random.choice(chars) for _ in range(size))\n", + "\n", + "random_str_1 = string_generator(size=10**2)\n", + "\n", + "md5_hash_rand_str_1 = hashlib.md5(random_str_1.encode())\n", + "sha256_hash_rand_str_1 = hashlib.sha256(random_str_1.encode())\n", + "sha512_hash_rand_str_1 = hashlib.sha512(random_str_1.encode())\n", + "\n", + "print('MD5: ' + md5_hash_rand_str_1.hexdigest())\n", + "print('SHA-256: ' + sha256_hash_rand_str_1.hexdigest())\n", + "print('SHA-512: ' + sha512_hash_rand_str_1.hexdigest())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "random_str_2 = string_generator(size=10**5)\n", + "\n", + "md5_hash_rand_str_2 = hashlib.md5(random_str_2.encode())\n", + "sha256_hash_rand_str_2 = hashlib.sha256(random_str_2.encode())\n", + "sha512_hash_rand_str_2 = hashlib.sha512(random_str_2.encode())\n", + "\n", + "print('MD5: ' + md5_hash_rand_str_2.hexdigest())\n", + "print('SHA-256: ' + sha256_hash_rand_str_2.hexdigest())\n", + "print('SHA-512: ' + sha512_hash_rand_str_2.hexdigest())" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -194,9 +242,36 @@ 
"source": [ "## Cryptographically Secure Hashing\n", "\n", - "### Overview" + "### Overview\n", + "\n", + "The primary difference between encryption and hashing is that encryption is reversible; however, hashing is not reversible.\n", + "\n", + "A hashing function is a _cryptographic hash function_ when it has the following properties:\n", + " \n", + " * It is easy to compute the hash value for any given input.\n", + " * It is infeasible to recover the input from a given hash.\n", + " * It is infeasible to modify the input without modifying the hash.\n", + " * It is infeasible to find two different inputs that produce the same hash.\n", + " \n", + "A cryptographic hash function should be resistant against:\n", + "\n", + " * Collisions - it is infeasible to find any two distinct inputs m and m' such that hash(m) = hash(m')\n", + " * Pre-images - given a hash h, it is infeasible to find any input m such that h = hash(m)\n", + " * Second-preimages - given m, it is infeasible to find m' distinct from m such that hash(m) = hash(m')\n", + " \n", + "### Modern Hashing Algorithms\n", + "\n", + " * MD5 is a hashing algorithm that is widely used, but it is cryptographically flawed because it is prone to collisions. MD5 is broken in terms of collisions, but it is still resistant in practice to pre-image and second-preimage attacks.\n", + " * SHA-256/SHA-512 are hashing functions that are similar, but work on different block sizes. These were designed by the NSA." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -243,6 +318,27 @@ "print(verify(invalid_cookie_str, sig))" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hashing a password with a randomly generated salt.\n", + "# Note: Don't actually use this in a system. 
It is far better to use a package written by an expert.\n", + "\n", + "password = 'spam_me_password'\n", + "salt = urlsafe_b64encode(uuid.uuid4().bytes)\n", + "\n", + "print('Salt: ' + salt.decode())\n", + "\n", + "hasher = hashlib.sha512()\n", + "hasher.update(password.encode() + salt)\n", + "hashed_password = urlsafe_b64encode(hasher.digest())\n", + "\n", + "print('Hashed password:' + hashed_password.decode())" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -298,10 +394,10 @@ "g.close()\n", "\n", "f_hash = hash_file(os.path.join(temp_dir, 'f.txt'))\n", - "print(f_hash)\n", + "print('f.txt hash: ' + f_hash.decode())\n", "\n", "g_hash = hash_file(os.path.join(temp_dir, 'g.txt'))\n", - "print(g_hash)\n", + "print('g.txt hash: ' + g_hash.decode())\n", "\n", "compare_digest(f_hash, g_hash)" ] @@ -309,11 +405,14 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# Generate random files.\n", "FILE_NUM = 1000\n", + "DUP_FILE_NUM = 10\n", "\n", "random_data = (os.urandom(2048) for _ in range(0, FILE_NUM))\n", "for indx, random_datum in enumerate(random_data):\n", @@ -321,13 +420,15 @@ " f.write(random_datum)\n", "\n", "# generate 1 duplicate file\n", - "random_file = os.path.join(temp_dir, '{}.txt'.format(str(random.randint(0, FILE_NUM - 1))))\n", + "random_files = [os.path.join(temp_dir, '{}.txt'.format(str(random.randint(0, FILE_NUM - 1)))) for _ in range(DUP_FILE_NUM)]\n", "\n", - "with open(random_file, 'rb') as f:\n", - " with open(os.path.join(temp_dir, 'dup_file.txt'), 'wb') as g:\n", - " g.write(f.read())\n", + "for random_file in random_files:\n", + " with open(random_file, 'rb') as f:\n", + " dup_file = os.path.basename(random_file)\n", + " with open(os.path.join(temp_dir, 'dup_file_{}.txt'.format(dup_file)), 'wb') as g:\n", + " g.write(f.read())\n", " \n", - "print(random_file)" + "print(random_files)" ] }, { @@ -366,6 +467,138 @@ "\n", "os.path.exists(temp_dir)" ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hashable Objects\n", + "\n", + "The built-in function `hash()` returns the hash value of an object, which is an integer, if the object has one. Python uses this value to quickly find dictionary keys during a dictionary lookup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(hash('Spam!'))\n", + "\n", + "print(hash('I am a comnputer?'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Tuples have a hash value\n", + "hash((1, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lists don't have a hash value\n", + "hash([1,2,3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Alpaca:\n", + " \n", + " def __init__(self, name, color):\n", + " self.name = name\n", + " self.color = color\n", + " \n", + " def __eq__(self, other):\n", + " return self.name == other.name and self.color == other.color\n", + " \n", + " def __hash__(self):\n", + " return hash((self.name, self.color))\n", + " \n", + " def __repr__(self):\n", + " return 'Alpaca({}, {})'.format(self.name, self.color)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "alpaca_1 = Alpaca('Bob', 'purple')\n", + "alpaca_2 = Alpaca('Erin', 'blue')\n", + "\n", + "print(hash(alpaca_1))\n", + "print(hash(alpaca_2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "alpaca_1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "alpaca_farm = {}\n", + "alpaca_farm.update({alpaca_1: alpaca_1})\n", + "alpaca_farm.update({alpaca_2: alpaca_2})" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "alpaca_farm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_alpaca = Alpaca('Bob', 'purple')\n", + "child_alpaca = Alpaca('Anna', 'yellow')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_alpaca in alpaca_farm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "child_alpaca in alpaca_farm" + ] } ], "metadata": {