From 791494e985ba7d3c91f44a9a67593fcbfb497206 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed Date: Sat, 11 Mar 2017 22:58:41 -0800 Subject: [PATCH 01/11] Adding regular expressions unit --- regular_expressions.ipynb | 719 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 719 insertions(+) create mode 100644 regular_expressions.ipynb diff --git a/regular_expressions.ipynb b/regular_expressions.ipynb new file mode 100644 index 0000000..5eaf5fa --- /dev/null +++ b/regular_expressions.ipynb @@ -0,0 +1,719 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regular Expressions in Python" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Python provides a powerful regular expression module (re).\n", + "A regular expression is a special sequence of characters that you can \n", + "think of as rules that help us match certain types of content \n", + "within string literals. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The backslash \"\\\" & Raw Strings" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "From your knowledge of string literals, you already know that \n", + "a backslash \"\\\" is interpreted by the Python parser as an escape\n", + "character. For example, in the following string, in order to use an internal quote, we have to escape it with the backslash character \"\\\"." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is a string literal that has a quote \" character.\n" + ] + } + ], + "source": [ + "text= \"This is a string literal that has a quote \\\" character.\" \n", + "print(text)" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "The parser also interprets the backslash in specific ways when followed by \n", + "specific sequences of characters. 
For example, the parser replaces the \n", + "‘\\n’ escape sequence with a newline character." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This has a quote \" char followed by \n", + "\n", + "\n", + " three new lines!!.\n" + ] + } + ], + "source": [ + "text= \"This has a quote \\\" char followed by \\n\\n\\n three new lines!!.\" \n", + "print(text)" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "The re module itself also makes use of\n", + "backslash characters to escape special regex characters, which results in \n", + "us having to escape the escape character itself at times.\n", + "This results in unreadable code. A good solution to this problem is to use what is known as a \"raw string\", which is simply achieved by prefixing\n", + "a string literal with the ‘r’ character (right before the opening quote of the string). When we do this, the parser will treat the string literal as is without attempting to make any internal substitutions. 
See the example below:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This has a quote \\\" char followed by \\n\\n\\n three new lines!!.\n" + ] + } + ], + "source": [ + "raw_text= r\"This has a quote \\\" char followed by \\n\\n\\n three new lines!!.\" \n", + "print(raw_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## And Three for the Road: match(), search(), and findall()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "<_sre.SRE_Match at 0x105bacf38>" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# match() will only find matches if they occur at the beginning of \n", + "# the searched string:\n", + "import re\n", + "text=\"apple berry orange berry\"\n", + "re.match(r'apple',text)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'apple'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#The above means there is a match and Python is returning the matching \n", + "# Object. 
\n", + "# We can access the matched pattern with: group(0)\n", + "my_match=re.match(r'apple',text)\n", + "my_match.group(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "# Since \"berry\" is not at the beginning of the string, there will be\n", + "# no match.\n", + "print(re.match(r'berry',text))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<_sre.SRE_Match object at 0x106d33780>\n", + "<_sre.SRE_Match object at 0x106d33780>\n", + "<_sre.SRE_Match object at 0x106d33780>\n" + ] + } + ], + "source": [ + "# search() is like match(), except that it is not restricted to finding a match\n", + "# at the beginning: It will find a match anywhere in the string:\n", + "print(re.search(r'berry',text))\n", + "print(re.search(r'apple',text))\n", + "print(re.search(r'orange',text))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'berry'" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note that search() stops looking after it finds the first match.\n", + "# As such, even though there are two occurrences of the string \"berry\",\n", + "# search() only returns one match (the first match)\n", + "my_berry_match=re.search(r'berry',text)\n", + "my_berry_match.group(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start index: 6\n", + "End index: 11\n" + ] + } + ], + "source": [ + "# We can actually access the indexes of the matched \"berry\" string:\n", + 
"start=my_berry_match.start()\n", + "end=my_berry_match.end()\n", + "print(\"Start index: %s\" % start)\n", + "print(\"End index: %s\" % end)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "berry\n" + ] + } + ], + "source": [ + "print(text[6:11])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['berry', 'berry']\n" + ] + } + ], + "source": [ + "# findall() is like search(), but is exhaustive: It finds all the matches\n", + "all_berry_matches=re.findall(r'berry',text)\n", + "print(all_berry_matches)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'berry'" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Since it returns a list of what matched, findall() does not work with\n", + "# grouping. 
Instead, just access each item in the returned list as \n", + "# what would have been a group \n", + "all_berry_matches[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### More on Grouping" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#hashtag1\n", + "#hashtag2\n" + ] + } + ], + "source": [ + "# We can surround certain parts of the regex in parentheses\n", + "# and access them later on via group numbers\n", + "tweet=\"This is a tweet with #hashtag1 and #hashtag2 https://cnn.com\"\n", + "my_hashtags=re.search(r'(#\\S+)\\s+\\S+\\s+(#\\S+)', tweet)\n", + "print(my_hashtags.group(1)) # whatever is in the first ()\n", + "print(my_hashtags.group(2)) # whatever is in the second ()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<_sre.SRE_Match object at 0x106be5d78>\n" + ] + } + ], + "source": [ + "print(my_hashtags)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and \n" + ] + } + ], + "source": [ + "# We can also give a group a name with (?P<name>...)\n", + "# and access it later on via that name\n", + "tweet=\"This is a tweet with #hashtag1 and #hashtag2 https://cnn.com\"\n", + "my_hashtags=re.search(r'(#\\S+)(?P<my_and_group>\\s+\\S+\\s+)(#\\S+)', tweet)\n", + "print(my_hashtags.group(\"my_and_group\")) " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "#tweet=\"This is a tweet with #hashtag1 and #hashtag2 https://cnn.com\"\n", + 
"tweet_modified=\"This is a tweet with #hashtag1 #hashtag2 https://cnn.com\"\n", + "\n", + "my_hashtags=re.search(r'#\\S+\\s+\\S+\\s+#\\S+', tweet_modified)\n", + "print(my_hashtags)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# (#\\S+) matches a hashtag \"#\", followed by one or more non-whitespaces\n", + "#----------------------------------------\n", + "# \\s+ matches one or more whitespaces\n", + "#----------------------------------------\n", + "# \\s+\\S+\\s+: Basically matches the \" and \" in the tweet, \n", + "# (note the preceding and following spaces)." + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('#hashtag1', '#hashtag2')\n" + ] + } + ], + "source": [ + "# groups() will return all matched groups as a tuple:\n", + "print(my_hashtags.groups())" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['#hashtag1', '#hashtag2']\n" + ] + } + ], + "source": [ + "# The pattern with search() above is useful if you specifically wanted\n", + "# a pattern that has \"hashtag+space(s)+and+space(s)+hashtag\"\n", + "# If you want just to get all hashtags in a tweet, just use \"findall\"\n", + "my_hashtags=re.findall(r'(#\\S+)', tweet)\n", + "print(my_hashtags)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['https://cnn.com']\n" + ] + } + ], + "source": [ + "my_url=re.findall(r'(https://\\S+.\\S+)', tweet)\n", + "print(my_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compiling for Re-Use" + ] + }, + { + "cell_type": "code", + 
"execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['#hashtag1', '#hashtag2', 'https://cnn.com']\n" + ] + } + ], + "source": [ + "# Compile a pattern for reuse.\n", + "#------------------------------\n", + "# The \"|\" helps us match a hashtag or a URL (so if both exist,\n", + "# we capture BOTH)\n", + "p=re.compile(r'(#\\S+|https://\\S+.\\S+)')\n", + "matches=re.findall(p, tweet) # \n", + "print(matches)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sidenote: re.sub with grouping" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is a tweet with and \n" + ] + } + ], + "source": [ + "# Using parentheses to capture a group is useful\n", + "# if you want to substitute\n", + "new_tweet=re.sub(r'(#\\S+)', '', tweet)\n", + "print(new_tweet)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is a tweet with and \n" + ] + } + ], + "source": [ + "new_tweet=re.sub(r'(#\\S+)', '', tweet)\n", + "print(new_tweet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grouping by name" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#hashtag1\n", + "#hashtag2\n" + ] + } + ], + "source": [ + "# Add ?P<name> before a pattern to group by name\n", + "my_hashtags=re.search(r'(?P<first>#\\S+)\\s+\\S+\\s+(?P<second>#\\S+)', tweet)\n", + "print(my_hashtags.group(\"first\")) # whatever is in the first ()\n", + "print(my_hashtags.group(\"second\")) # whatever is in the second ()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + 
"metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['interesting', 'last']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find all words with the character \"s\"\n", + "story=\"Samy told me an interesting story was airing on CBC last night...\"\n", + "re.findall(r'\\w+s\\w+', story)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['interesting', 'story', 'was', 'last']" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Since \\w* matches zero or more characters, we can get all words\n", + "# with \"s\" as follows:\n", + "re.findall(r'\\w*s\\w*', story)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Samy', 'interesting', 'story', 'was', 'last']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Well, almost! 
Let's ignore case with \"re.I\" to catch \"Samy\" as well.\n", + "re.findall(r'\\w*s\\w*', story, re.I)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From fc50c105ec63358cbd1e434d94f7ade73da9dd79 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed Date: Sat, 11 Mar 2017 22:59:20 -0800 Subject: [PATCH 02/11] Adding processing raw text unit --- processing_raw_text.ipynb | 1010 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1010 insertions(+) create mode 100644 processing_raw_text.ipynb diff --git a/processing_raw_text.ipynb b/processing_raw_text.ipynb new file mode 100644 index 0000000..1bb913c --- /dev/null +++ b/processing_raw_text.ipynb @@ -0,0 +1,1010 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Processing Raw Text" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Also see: \n", + "## http://www.nltk.org/book/ch03.html, https://docs.python.org/2/howto/urllib2.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a book from Project Gutenberg with Python:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of 'response' is :\n", + "Type of 'raw' is :\n" + ] + } + ], + "source": [ + "from urllib2 import Request, urlopen\n", + "\n", + "url=\"http://www.gutenberg.org/files/54255/54255-0.txt\"\n", + "response = urlopen(url)\n", + "raw = response.read().decode('utf8')\n", + 
"#--------------------------------------------------\n", + "# Check types...\n", + "print(\"Type of \\'response\\' is %s:\")% type(response)\n", + "print(\"Type of \\'raw\\' is %s:\")% type(raw)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Project Gutenberg EBook of Narrative of Travels in Europe, Asia, and\r\n", + "Africa, in the Seventeenth Century, Volum, by Evliya Çelebi and Joseph Hammer-Purgstall\r\n", + "\r\n" + ] + } + ], + "source": [ + "print(raw[:165])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['hey', ',', 'guys', ',', 'how', 'is', 'life', '?', '?', '?', '!']\n" + ] + } + ], + "source": [ + "from nltk import word_tokenize\n", + "t=\"hey, guys, how is life???!\"\n", + "tt =word_tokenize(t)\n", + "print(tt)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('hey', 'NN'), (',', ','), ('guys', 'NNS'), (',', ','), ('how', 'WRB'), ('is', 'VBZ'), ('life', 'NN'), ('?', '.'), ('?', '.'), ('?', '.'), ('!', '.')]\n" + ] + } + ], + "source": [ + "ttt = pos_tag(tt)\n", + "print(ttt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenize and pos-tag the text:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "144822\n", + "144822\n" + ] + } + ], + "source": [ + "from nltk import word_tokenize, pos_tag\n", + "#------------------------------\n", + "tokens = word_tokenize(raw)\n", + "print(len(tokens))\n", + "tagged=pos_tag(tokens)\n", + "print(len(tagged))" + ] + }, + { + "cell_type": 
"code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[u'\\ufeffThe', u'Project', u'Gutenberg', u'EBook', u'of', u'Narrative', u'of', u'Travels', u'in', u'Europe', u',', u'Asia', u',', u'and', u'Africa', u',', u'in', u'the', u'Seventeenth', u'Century', u',', u'Volum', u',', u'by', u'Evliya', u'\\xc7elebi', u'and', u'Joseph', u'Hammer-Purgstall', u'This', u'eBook', u'is', u'for', u'the', u'use', u'of', u'anyone', u'anywhere', u'in', u'the', u'United', u'States', u'and', u'most', u'other', u'parts', u'of', u'the', u'world', u'at']\n" + ] + } + ], + "source": [ + "print(tokens[:50]) # list of unicode items" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(u'\\ufeffThe', 'NN'), (u'Project', 'NNP'), (u'Gutenberg', 'NNP'), (u'EBook', 'NNP'), (u'of', 'IN'), (u'Narrative', 'NNP'), (u'of', 'IN'), (u'Travels', 'NNP'), (u'in', 'IN'), (u'Europe', 'NNP')]\n" + ] + } + ], + "source": [ + "print(tagged[:10]) # list of tuples (word,pos_tag pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['hello', 'hi']\n" + ] + } + ], + "source": [ + "wds=[\"hello\", \"hi\", \"life\"]\n", + "h_wds= [w for w in wds if w.startswith(\"h\")]\n", + "\n", + "\n", + "new_words=[]\n", + "for w in wds:\n", + " if w.startswith(\"h\"):\n", + " new_words.append(w)\n", + "print(new_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['plays']\n" + ] + } + ], + "source": [ + "pairs=[ (\"Alex\", \"NN\"), (\"plays\", \"VBZ\") ]\n", + "verbs=[ x[0] for x in pairs if 
x[1]==\"VBZ\"]\n", + "print(verbs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Note: The pos tagger of course makes mistakes, but it performs reasonably well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List comprehension on \"tagged\"" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project\n", + "Gutenberg\n", + "EBook\n", + "Narrative\n", + "Travels\n", + "Europe\n", + "Asia\n", + "Africa\n", + "Seventeenth\n", + "Century\n", + "Volum\n", + "Evliya\n", + "Çelebi\n", + "Joseph\n", + "Hammer-Purgstall\n", + "United\n", + "Project\n", + "Gutenberg\n", + "License\n", + "United\n", + "Europe\n", + "Asia\n", + "Africa\n", + "Seventeenth\n", + "Century\n", + "II\n", + "Evliya\n", + "Çelebi\n", + "Evliya\n", + "Çelebi\n", + "Joseph\n", + "Hammer-Purgstall\n", + "Release\n", + "Date\n", + "February\n", + "[\n", + "EBook\n", + "Character\n", + "***\n", + "START\n", + "THIS\n", + "PROJECT\n", + "GUTENBERG\n", + "EBOOK\n", + "NARRATIVE\n", + "OF\n", + "TRAVELS\n", + "***\n", + "Produced\n", + "Turgut\n" + ] + } + ], + "source": [ + "# Named entities (approximated here by proper-noun tags, NNP):\n", + "ne=[pair[0] for pair in tagged if pair[-1]==\"NNP\"]\n", + "for e in ne[:50]:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "breadth\n", + "remarkable\n", + "ruby-coloured\n", + "particular\n", + "tombs\n", + "gun-shot’s\n", + "yellow\n", + "rapid\n", + "mild\n", + "mile\n", + "sleep\n", + "legal\n", + "forty-six\n", + "Elephant\n", + "dish\n", + "follow\n", + "abundant\n", + "religious\n", + "washing-tubs\n", + "dreadful\n", + "seventy-seven\n", + "pardon\n", + "hunting\n", + "swam\n", + "outdated\n", + "becas\n", + "mosque\n", + "young\n", + "“Mevlúd-námeh\n", 
+ "underwent\n", + "answered\n", + "tail\n", + "foster\n", + "obstinate\n", + "stable\n", + "suite\n", + "Precious\n", + "farsang’s\n", + "worth\n", + "orderly\n", + "virtuous\n", + "Sheikh-ul-islám\n", + "amorous\n", + "exempt\n", + "www.gutenberg.org\n", + "perishable\n", + "navigable\n", + "limpid\n", + "fat\n", + "father’s\n" + ] + } + ], + "source": [ + "# Adjectives\n", + "adjs= set([pair[0] for pair in tagged if pair[-1]==\"JJ\"]) # we pass the list to set to uniqify\n", + "adjs= list(adjs) #Cast to list again so that we access only few in print\n", + "# Note: 'set' object has no attribute '__getitem__' and so we cannot do adjs[:15] on a set\n", + "for a in adjs[:50]:\n", + " print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15238\n" + ] + } + ], + "source": [ + "# How many ne?; note these are not uniqified\n", + "print(len(ne))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1263\n" + ] + } + ], + "source": [ + "# How many uniqe adjs?\n", + "print(len(adjs))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get collocations" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project Gutenberg-tm; three hundred; hundred houses; Black Sea;\n", + "thousand men; two hundred; one hundred; great number; fifty aspers;\n", + "next day; Project Gutenberg; Uzún Hassan; three days; thousand houses;\n", + "five hours; Sultán Murad; Ahmed Páshá; Kizil Irmák; five hundred;\n", + "Mustafa Páshá\n" + ] + } + ], + "source": [ + "from nltk import Text\n", + "text=Text(tokens)\n", + "#print(type(text))\n", + "text.collocations()" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Accessing webpages/html" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " Sh\n" + ] + } + ], + "source": [ + "from bs4 import BeautifulSoup\n", + "url=\"http://www.bbc.com/news/technology-38892383\"\n", + "response = urlopen(url)\n", + "html = response.read().decode('utf8')\n", + "print(html[:200])" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shopping\n", + "robots\n", + "on\n", + "the\n", + "march\n", + "in\n", + "Ocado\n", + "-\n", + "BBC\n", + "News\n" + ] + } + ], + "source": [ + "raw = BeautifulSoup(html, \"lxml\").get_text()\n", + "tokens = word_tokenize(raw)\n", + "tok=tokens[:10]\n", + "for t in tok:\n", + " print(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Working with unicode" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2167789138\tمعَ فجر العام الجديد : رجوتُ إلهيَ أن يجعلني ويجعلكمِ من أسعدِ خلقهِ ، و يرزقني ويرزقكم أضعاافَ أمنيآتِكم حتَى ترضون ...صباحكم رضى||$||\"@jumana_sj2: ولا اقول عاادي كمان يدخلو اغاني كوريه خنضحك #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@s_h_osho: نبي قناه كوريه ليش فيه قناه هنديه ومافيه كوريه؟ #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@LINA_ALADEEB: بعيداً عن خيالات الحب احياناً السعاده تكون عباره ع\n" + ] + } + ], + "source": [ + "import codecs\n", + "ara_text=codecs.open(\"sample_concat.tsv\", \"r\", \"utf-8\").readlines()[0]\n", + "print(ara_text[:500])" + ] + }, + { + 
"cell_type": "code", + "execution_count": 74, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2167789138\tمع فجر العام الجديد : رجوت إلهي أن يجعلني ويجعلكم من أسعد خلقه ، و يرزقني ويرزقكم أضعااف أمنيآتكم حتى ترضون ...صباحكم رضى||$||\"@jumana_sj2: ولا اقول عاادي كمان يدخلو اغاني كوريه خنضحك #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@s_h_osho: نبي قناه كوريه ليش فيه قناه هنديه ومافيه كوريه؟ #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@LINA_ALADEEB: بعيدا عن خيالات الحب احيانا السعاده تكون عباره عن - برنامج \n" + ] + } + ], + "source": [ + "def remove_unicode_diac(text):\n", + " \"\"\"Takes Arabic in utf-8 and returns same text without diac\"\"\"\n", + " # Replace diacritics with nothing \n", + " text = text.replace(u\"\\u064B\", \"\")# fatHatayn\n", + " text = text.replace(u\"\\u064C\", \"\") # Dammatayn\n", + " text = text.replace(u\"\\u064D\", \"\")# kasratayn\n", + " text = text.replace(u\"\\u064E\", \"\")# fatHa\n", + " text = text.replace(u\"\\u064F\", \"\") # Damma\n", + " text = text.replace(u\"\\u0650\", \"\")# kasra\n", + " text = text.replace(u\"\\u0651\", \"\")# shaddah\n", + " text = text.replace(u\"\\u0652\", \"\")# sukuun\n", + " text = text.replace(u\"\\u0670\", \"`\") # dagger 'alif\n", + " return text\n", + "\n", + "ara_text_no_diac =remove_unicode_diac(ara_text)\n", + "print(ara_text_no_diac[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<type 'unicode'>\n" + ] + } + ], + "source": [ + "print(type(ara_text_no_diac))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expressions preview!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hey there, take a look: <URL> #love_robots!\n" + ] + } + ], + "source": [ + "import re\n", + "# This will replace the URL \"http://www.bbc.com/news/technology-38892383\" with a string token \"<URL>\"\n", + "tweet=\"Hey there, take a look: http://www.bbc.com/news #love_robots!\"\n", + "tweet = re.sub(r'https?://[^\\s<>\"]+|www\\.[^\\s<>\"]+', '<URL>',tweet)\n", + "print(tweet)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['take']\n" + ] + } + ], + "source": [ + "e_ending=[w for w in tweet.split() if re.search('e$', w)]\n", + "print(e_ending) # Note that \"there,\" ends in \",\"" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['there', 'take']\n" + ] + } + ], + "source": [ + "import string\n", + "punc = [char for char in string.punctuation]\n", + "def clean_punc(punc, text):\n", + " for i in punc:\n", + " text=text.replace(i, \"\")\n", + " return text\n", + "\n", + "tweet=clean_punc(punc, tweet)\n", + "e_ending=[w for w in tweet.split() if re.search('e$', w)]\n", + "print(e_ending) # Note that \"there,\" ends in \",\"" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['!', '\"', '#', '$', '%', '&', \"'\", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\\\', ']', '^', '_', '`', '{', '|', '}', '~']\n" + ] + } + ], + "source": [ + "print(punc)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "collapsed": false + }, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 'hey people')\n", + "(1, 'how are you?')\n", + "(2, 'life is good!')\n" + ] + } + ], + "source": [ + "alldata=[\"hey people\", \"how are you?\", \"life is good!\"]\n", + "for line_no, line in enumerate(alldata):\n", + " print(line_no, line)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '\"', 'teachers', '\"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', \"high's\", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '\"', 'teachers', '\"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', \"teachers'\", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'high', '.', 'a', 'classic', 'line', ':', 'inspector', ':', \"i'm\", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', '.', 'student', ':', 'welcome', 'to', 'bromwell', 'high', '.', 'i', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'bromwell', 'high', 'is', 'far', 'fetched', '.', 'what', 'a', 'pity', 'that', 'it', \"isn't\", '!']\n" + ] + } + ], + "source": [ + "line=\"\"\"_*0 bromwell high is a cartoon comedy . 
it ran at the same time as some other programs about school life , such as \" teachers \" . my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is \" teachers \" . the scramble to survive financially , the insightful students who can see right through their pathetic teachers' pomp , the pettiness of the whole situation , all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school , i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line : inspector : i'm here to sack one of your teachers . student : welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn't ! \"\"\"\n", + "line.split()[0]\n", + "words=line.split()[1:]\n", + "print(words)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "space=defaultdict(int)\n", + "for w in words:\n", + " space[w]=len(space)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "defaultdict(int,\n", + " {'!': 96,\n", + " '\"': 41,\n", + " ',': 74,\n", + " '.': 93,\n", + " '35': 25,\n", + " ':': 85,\n", + " 'a': 94,\n", + " 'about': 17,\n", + " 'adults': 88,\n", + " 'age': 89,\n", + " 'all': 59,\n", + " 'and': 64,\n", + " 'as': 22,\n", + " 'at': 76,\n", + " 'believe': 33,\n", + " 'bromwell': 91,\n", + " 'burn': 72,\n", + " 'can': 47,\n", + " 'cartoon': 4,\n", + " 'classic': 76,\n", + " 'closer': 38,\n", + " 'comedy': 5,\n", + " 'down': 73,\n", + " 'episode': 67,\n", + " 'expect': 86,\n", + " 'far': 91,\n", + " 'fetched': 92,\n", + " 'financially': 43,\n", + " 'here': 81,\n", + " 'high': 91,\n", + " \"high's\": 35,\n", + " 'i': 86,\n", + " \"i'm\": 80,\n", + 
" 'immediately': 74,\n", + " 'in': 68,\n", + " 'insightful': 44,\n", + " 'inspector': 79,\n", + " 'is': 91,\n", + " \"isn't\": 95,\n", + " 'it': 95,\n", + " 'knew': 63,\n", + " 'lead': 30,\n", + " 'life': 19,\n", + " 'line': 77,\n", + " 'many': 87,\n", + " 'me': 61,\n", + " 'much': 37,\n", + " 'my': 89,\n", + " 'of': 89,\n", + " 'one': 83,\n", + " 'other': 15,\n", + " 'pathetic': 52,\n", + " 'pettiness': 55,\n", + " 'pity': 94,\n", + " 'pomp': 54,\n", + " 'profession': 29,\n", + " 'programs': 16,\n", + " 'ran': 8,\n", + " 'reality': 39,\n", + " 'recalled': 75,\n", + " 'remind': 60,\n", + " 'repeatedly': 70,\n", + " 'right': 49,\n", + " 'sack': 82,\n", + " 'same': 11,\n", + " 'satire': 36,\n", + " 'saw': 66,\n", + " 'school': 74,\n", + " 'schools': 61,\n", + " 'scramble': 41,\n", + " 'see': 48,\n", + " 'situation': 58,\n", + " 'some': 14,\n", + " 'student': 85,\n", + " 'students': 65,\n", + " 'such': 21,\n", + " 'survive': 42,\n", + " 'teachers': 85,\n", + " \"teachers'\": 53,\n", + " 'teaching': 28,\n", + " 'than': 40,\n", + " 'that': 95,\n", + " 'the': 74,\n", + " 'their': 65,\n", + " 'think': 90,\n", + " 'through': 50,\n", + " 'time': 12,\n", + " 'to': 86,\n", + " 'tried': 71,\n", + " 'welcome': 85,\n", + " 'what': 93,\n", + " 'when': 65,\n", + " 'which': 68,\n", + " 'who': 46,\n", + " 'whole': 57,\n", + " 'years': 26,\n", + " 'your': 84})" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "space" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 
0.]\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "vec = np.zeros(len(space))\n", + "print(vec)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.\n", + " 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1.\n", + " 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.\n", + " 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1.\n", + " 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n", + " 1. 1. 1. 1. 1. 1. 1.]\n" + ] + } + ], + "source": [ + "for w in words:\n", + " vec[space[w]]=1\n", + "print(vec)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "abc\n", + "cd\n" + ] + } + ], + "source": [ + "x=[\"a\", \"ab\", \"abc\", \"cd\", \"xxx\"]\n", + "for i in x:\n", + " if \"c\" in i:\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['abc', 'cd']\n" + ] + } + ], + "source": [ + "c_list=[i for i in x if \"c\" in i]\n", + "print(c_list)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c4ee27b90609780fb3b5359e9d53870ea4f287f2 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Sat, 11 Mar 2017 23:05:06 -0800 Subject: [PATCH 03/11] Update README.md Just updates README some! 
--- README.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index a97e86f..44fea55 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,18 @@ # A Python Tutorial: -This is code I wrote for courses I teach at Indiana University. -The first parts of the code in this tutorial are meant for Python beginners, and the code grows more advanced as you advance through the later parts. +This is code I wrote for courses I taught at Indiana University and then at the University of British Columbia. +The first parts of the code in this tutorial are meant for Python beginners, and the code grows more advanced in later parts. -In the context of this tutorial, I plan to include sections covering the Natural Language Toolkit (NLTK), gensim, scikit-learn, visualization, numpy, etc. +In the context of this tutorial, I have added sections covering text processing and the use of the Natural Language Toolkit (NLTK), gensim, and scikit-learn. I plan to add parts on visualization, numpy, etc. In addition, I plan to add more advanced code covering practical machine learning issues like vector space models to perform certain tasks like sentiment analysis. Finally, I also plan to introduce some deep learning tools and provide some relevant code. -The courses teach skills for at the intersection of fields like natural language processing, machine learning, social media mining, text mining, data science, etc. +The courses teach data science skills (i.e., skills at the intersection of natural language processing, applied machine learning, and social media mining). The code is written primarily in Python 2.7. A migration to Python 3 shoul be straightforward. + Some of the code is written and run during class sessions and so it is shared without much polishing. +In some places, you may find some repetition (primarily for pedagogical purposes inside class). I provide some comments, before I push here, as much as I can.
From a0942281d8ea29850e387bc045eae296b8b9c0c4 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Sun, 19 Mar 2017 10:20:07 -0700 Subject: [PATCH 04/11] Add files via upload Simplifying the vector_space tutorial. --- python_tutorial_part_6_vector_space.ipynb | 370 ++++++++++++++++------ 1 file changed, 272 insertions(+), 98 deletions(-) diff --git a/python_tutorial_part_6_vector_space.ipynb b/python_tutorial_part_6_vector_space.ipynb index 21c002e..325da56 100644 --- a/python_tutorial_part_6_vector_space.ipynb +++ b/python_tutorial_part_6_vector_space.ipynb @@ -21,9 +21,173 @@ "# distribution of movie review sentiment data." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## namedtuple" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "female\n", + "Visual Arts\n" + ] + } + ], + "source": [ + "# While a Python tuple is indexed numerically (like a list), a named tuple assigns names to fields and \n", + "# is also indexed numerically. 
This makes it possible to access the fields in a named tuple using these names\n", + "# as if they were attributes of an object (via dotting into the namedtuple)\n", + "# See also here: https://docs.python.org/2/library/collections.html\n", + "from collections import namedtuple\n", + "Student = namedtuple(\"Student\", [\"name\", \"age\", \"gender\", \"course\"])\n", + "#--------------------------------------------------------------------\n", + "# Note: You can also provide field names as a space-delimited string, rather than a list.\n", + "#Student = namedtuple(\"Student\", \"name age gender course\")\n", + "#--------------------------------------------------------------------\n", + "\n", + "angela=Student(name=\"Angela\", age=45, gender=\"female\", course=\"Python\")\n", + "soha=Student(name=\"Soha\", age=25, gender=\"female\", course=\"Visual Arts\")\n", + "print(angela.gender)\n", + "print(soha.course)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Soha\n", + "25\n", + "female\n", + "Visual Arts\n" + ] + } + ], + "source": [ + "# A namedtuple is also iterable like a tuple\n", + "for i in soha:\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Visual Arts'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can access a namedtuple the same way you access a tuple or a list:\n", + "soha[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Student(name='Angela', age=45, gender='female', course='Python')\n", + "Student(name='Soha', age=25, gender='female', course='Visual Arts')\n" + ] + } + ], + "source": [ + 
"# We can now create a list where we append the two namedtuples above.\n", + "# i.e., a list of namedtuples\n", + "all_students=[]\n", + "all_students.append(angela)\n", + "all_students.append(soha)\n", + "for s in all_students:\n", + " print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Angela is 45 years old.\n", + "- Soha is 25 years old.\n" + ] + } + ], + "source": [ + "for s in all_students:\n", + " print(\"- {} is {} years old.\").format(s.name, s.age)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# We should usually get tags automatically based on input data file.\n", + "# In the input data file we have, we know that the first 12500 data points are positive/1.0 and the next 12500 are\n", + "# negative/0.0 then the next 12500 is poitive and the fourth chunk is negative.\n", + "# So basically the train_data has 25K (with the first half positive and the second half negative)\n", + "# and test_data with the same setup for class label. 
\n", + "# The rest of the data in the file is unknown/neutral/-1 and we don't use that part.\n", + "\n", + "def map_tags(post_index):\n", + " # if post is positive, tag=1, if it is negative tag=0, if it is neutral, tag=-1\n", + " tag=-1\n", + " if post_index < 12500:\n", + " tag=1\n", + " elif post_index < 25000:\n", + " tag=0\n", + " elif post_index < 37500:\n", + " tag=1\n", + " elif post_index < 50000:\n", + " tag=0\n", + " return tag" + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 79, "metadata": { "collapsed": false }, @@ -32,37 +196,52 @@ "name": "stdout", "output_type": "stream", "text": [ - "25000\n", - "200\n", - "200\n" + "50000\n", + "**************************************************\n", + "DataDoc(tag=1, words=['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '\"', 'teachers', '\"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', \"high's\", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '\"', 'teachers', '\"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', \"teachers'\", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'high', '.', 'a', 'classic', 'line', ':', 'inspector', ':', \"i'm\", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', '.', 'student', ':', 'welcome', 'to', 'bromwell', 'high', '.', 'i', 'expect', 
'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'bromwell', 'high', 'is', 'far', 'fetched', '.', 'what', 'a', 'pity', 'that', 'it', \"isn't\", '!'])\n", + "**************************************************\n" ] } ], "source": [ "from collections import namedtuple\n", "\n", - "all_data = [] \n", - "DataDoc= namedtuple('DataDoc', 'tag words')\n", - "with open('/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata-id.txt') as alldata:\n", - " for line_no, line in enumerate(alldata):\n", - " label=line.split()[0]\n", - " word_list=line.lower().split()[1:]\n", - " all_data.append(DataDoc(label, word_list))\n", - " #print my_data[line_no]\n", - " #break\n", - "train_data = all_data[:25000]\n", - "test_data = all_data[25000:50000]\n", - "print len(train_data)\n", + "def get_all_data():\n", + " \"\"\"\n", + " Returns a list of namedtuples from the IMDB file.\n", + " Each namedtuple has two named fields:\n", + " tag= class label (0 for \"negative\" and 1 for \"positive\")\n", + " word_list the list of words in the review\n", + " \"\"\"\n", + " # a list to house all the data\n", + " all_data = [] \n", + " \n", + " DataDoc= namedtuple('DataDoc', 'tag words')\n", + " with open('/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata-id.txt') as alldata:\n", + " for line_no, line in enumerate(alldata):\n", + " post_index=int(line.split()[0].split(\"*\")[-1])\n", + " label=map_tags(post_index)\n", + " word_list=line.lower().split()[1:]\n", + " all_data.append(DataDoc(label, word_list))\n", + " return all_data\n", "\n", - "train_data=train_data[:100]+train_data[12500:12600]\n", - "test_data=test_data[:100]+test_data[12500:12600]\n", - "print len(train_data)\n", - "print len(test_data)" + "# Call the function to get the data\n", + "all_data= get_all_data()\n", + "# The data are 100K reviews as explained earlier\n", + "# Since the last 50K are unknown, let's throw them away\n", + "all_data=all_data[:50000]\n", + "print(len(all_data))\n", + 
"print(\"*\"*50)\n", + "# print the first namedtuple\n", + "print(all_data[0])\n", + "print(\"*\"*50)\n", + "# print the last namedtuple\n", + "#print(all_data[-1])" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 80, "metadata": { "collapsed": false }, @@ -71,8 +250,35 @@ "name": "stdout", "output_type": "stream", "text": [ - "7142\n", - "6994\n" + "500\n" + ] + } + ], + "source": [ + "# The data set is big, and we want to only work with a very small sample of it.\n", + "# Let's randomize the reviews and then take only 500 of them and call them train_data.\n", + "# We will then do cross-validation on these later.\n", + "from random import shuffle\n", + "shuffle(all_data)\n", + "#-------------------------\n", + "train_data = all_data[:500]\n", + "#------------------------\n", + "print len(train_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13848\n", + "13828\n" ] } ], @@ -105,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 82, "metadata": { "collapsed": false }, @@ -114,9 +320,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "0\n", - "200\n", - "200\n" + "0 [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]\n" ] } ], @@ -138,18 +342,15 @@ " \n", "\n", "train_vecs= [get_sparse_vec(data_point, word_space) for data_point in train_data]\n", - "test_vecs= [get_sparse_vec(data_point, word_space) for data_point in test_data]\n", - "#test_vecs= get_sparse_vectors(test_data, word_space)\n", - "\n", - "#print train_vecs, test_vecs[0]\n", - "print len(train_data[12500:12600])\n", - "print len(train_vecs)\n", - "print len(test_vecs)" + "# Get class labels\n", + "train_tags=[train_data[i].tag for i in range(len(train_data))]\n", + "# Let's look at the last training data point\n", + "print train_tags[-1], train_vecs[-1][:10]" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 83, "metadata": { "collapsed": false }, @@ -158,54 +359,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.0 [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", - "200\n", - "200\n" - ] - } - ], - "source": [ - "# We should usually get tags automatically based on input data file.\n", - "# In the input data file we have, we know that the first 12500 data points are positive/1.0 and the next 12500 are\n", - "# negative/0.0 then the next 12500 is poitive and the fourth chunk is negative.\n", - "# So basically the train_data has 25K (with the first half positive and the second half negative)\n", - "# and test_data with the same setup for class label. 
\n", - "# The rest of the data in the file is unknown and we don't use that part.\n", - "# We could write code to extract label automatically and we will do this based on a standardized format we will work with\n", - "# later, for now we will hard-code the labels.\n", - "\n", - "from random import shuffle, randint\n", - "\n", - "\n", - "train_tags=[ 1.0 for i in range(100)] + [ 0.0 for i in range(100)]\n", - "test_tags=[ 1.0 for i in range(100)] + [ 0.0 for i in range(100)]\n", - "\n", - "\n", - "#train_tags=[ 1.0 for i in range(12500)] + [ 0.0 for i in range(12500)]\n", - "#test_tags=[ 1.0 for i in range(12500)] + [ 0.0 for i in range(12500)]\n", - "# Side note: If the first token in each line were the tag, we could get tags as follows:\n", - "# tags= [train_data[i].tag for i in range(len(train_data))]\n", - "print train_tags[-1], train_vecs[-1][:10]\n", - "print len(train_tags)\n", - "print len(test_tags)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(200, 7142)\n" + "(500, 13848)\n" ] } ], "source": [ + "# scikit-learn likes to take data as numpy arrays. 
So, let's change our data accordingly:\n", "train_vecs=np.array(train_vecs)\n", "train_tags=np.array(train_tags)\n", "print train_vecs.shape" @@ -213,7 +372,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 84, "metadata": { "collapsed": false }, @@ -234,32 +393,25 @@ "================================================== \n", "\n", "********************\n", - "\t accuracy_score\t0.715\n", + "\t accuracy_score\t0.644\n", "********************\n", - "precision_score\t0.765432098765\n", - "recall_score\t0.62\n", + "precision_score\t0.657692307692\n", + "recall_score\t0.657692307692\n", "\n", "classification_report:\n", "\n", " precision recall f1-score support\n", "\n", - " 0.0 0.68 0.81 0.74 100\n", - " 1.0 0.77 0.62 0.69 100\n", + " 0 0.63 0.63 0.63 240\n", + " 1 0.66 0.66 0.66 260\n", "\n", - "avg / total 0.72 0.71 0.71 200\n", + "avg / total 0.64 0.64 0.64 500\n", "\n", "\n", "confusion_matrix:\n", "\n", - "[[81 19]\n", - " [38 62]]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using gpu device 0: GeForce GT 750M\n" + "[[151 89]\n", + " [ 89 171]]\n" ] } ], @@ -319,8 +471,30 @@ "print \"precision_score\\t\", metrics.precision_score(train_tags, predicted)\n", "print \"recall_score\\t\", metrics.recall_score(train_tags, predicted)\n", "print \"\\nclassification_report:\\n\\n\", metrics.classification_report(train_tags, predicted)\n", - "print \"\\nconfusion_matrix:\\n\\n\", metrics.confusion_matrix(train_tags, predicted)\n", - " \n" + "print \"\\nconfusion_matrix:\\n\\n\", metrics.confusion_matrix(train_tags, predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.52\n" + ] + } + ], + "source": [ + "# Usually, we calculate a baseline as the majority class in training data.\n", + "# Here, to simplify, we just get the majority class in all the data (see support, which is 
the number of data points in each\n", + "# class, in the classification report above)\n", + "majority_class=260/500.0\n", + "print(majority_class)" ] } ], @@ -340,7 +514,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.12" } }, "nbformat": 4, From 84dd3d3773a94b432987508b2dc714566a4eda75 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Sun, 19 Mar 2017 10:26:05 -0700 Subject: [PATCH 05/11] Add files via upload Adding some explanations to the tutorial... --- python_tutorial_part_6_vector_space.ipynb | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/python_tutorial_part_6_vector_space.ipynb b/python_tutorial_part_6_vector_space.ipynb index 325da56..03f48e1 100644 --- a/python_tutorial_part_6_vector_space.ipynb +++ b/python_tutorial_part_6_vector_space.ipynb @@ -18,7 +18,8 @@ "outputs": [], "source": [ "# This is code to build a vector space model, with SVMs on Andrew Mass' \n", - "# distribution of movie review sentiment data." + "# distribution of movie review sentiment data.\n", + "# Since we use Python's namedtuple on the code, let's take a look at what a namedtuple is first" ] }, { @@ -170,6 +171,20 @@ "# So basically the train_data has 25K (with the first half positive and the second half negative)\n", "# and test_data with the same setup for class label. \n", "# The rest of the data in the file is unknown/neutral/-1 and we don't use that part.\n", + "#------------------------------------------\n", + "# Format of the data is as below, with each line starting with an index.\n", + "# For example, \"_*0\" is the index in the first line. 
We will ignore the \"_*\" part and cast the index into\n", + "# an int\n", + "#------------------------------------------\n", + "\"\"\"\n", + "_*0 bromwell high is a cartoon comedy ....\n", + "_*1 homelessness ( or houselessness as george carlin stated )...\n", + "_*2 brilliant over-acting by lesley ann warren .\n", + "\"\"\"\n", + "#------------------------------------------\n", + "# Let's build a function that takes the index in the file and returns a numerical index that can be seen \n", + "# by the classifier we will use later\n", + "#------------------------------------------\n", "\n", "def map_tags(post_index):\n", " # if post is positive, tag=1, if it is negative tag=0, if it is neutral, tag=-1\n", From 5594236ab7cd506bcbd5d70c17ec3a66fdbc0c69 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Mon, 20 Mar 2017 13:13:23 -0700 Subject: [PATCH 06/11] Add files via upload Further cleaning... --- python_tutorial_part_6_vector_space.ipynb | 176 ++++++++++++++++++++-- 1 file changed, 165 insertions(+), 11 deletions(-) diff --git a/python_tutorial_part_6_vector_space.ipynb b/python_tutorial_part_6_vector_space.ipynb index 03f48e1..7bb6c5a 100644 --- a/python_tutorial_part_6_vector_space.ipynb +++ b/python_tutorial_part_6_vector_space.ipynb @@ -195,11 +195,58 @@ " tag=0\n", " elif post_index < 37500:\n", " tag=1\n", - " elif post_index < 50000:\n", - " tag=0\n", + " else:\n", + " pass\n", " return tag" ] }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 'bromwell high is a cartoon comedy ....')\n", + "(1, 'homelessness ( or houselessness as george carlin stated )')\n", + "(2, 'brilliant over-acting by lesley ann warren')\n" + ] + } + ], + "source": [ + "l=[\"bromwell high is a cartoon comedy ....\", \\\n", + " \"homelessness ( or houselessness as george carlin stated )\",\\\n", + " \"brilliant over-acting by 
lesley ann warren\"]\n", + "\n", + "for no, post in enumerate(l):\n", + " print(no, post)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<type 'int'>\n" + ] + } + ], + "source": [ + "line=\"_*0 bromwell high is a cartoon comedy ....\"\n", + "label= int(line.split()[0].split(\"*\")[-1])\n", + "print(type(label))" + ] + }, { "cell_type": "code", "execution_count": 79, @@ -231,11 +278,11 @@ " # a list to house all the data\n", " all_data = [] \n", " \n", - " DataDoc= namedtuple('DataDoc', 'tag words')\n", + " DataDoc= namedtuple('DataDoc', ['tag', 'words'])\n", " with open('/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata-id.txt') as alldata:\n", " for line_no, line in enumerate(alldata):\n", - " post_index=int(line.split()[0].split(\"*\")[-1])\n", - " label=map_tags(post_index)\n", + " #post_index=int(line.split()[0].split(\"*\")[-1])\n", + " label=map_tags(line_no)\n", " word_list=line.lower().split()[1:]\n", " all_data.append(DataDoc(label, word_list))\n", " return all_data\n", @@ -316,7 +363,7 @@ " for w in doc.words:\n", " # indexes of words won't be in sequential order as they occur in data (can you tell why?), \n", " # but that doesn't matter.\n", - " word_space[w]=len(word_space)\n", + " word_space[w]=len(word_space+1)\n", " return word_space\n", "\n", "word_space=get_space(train_data)\n", @@ -326,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 103, "metadata": { "collapsed": false }, @@ -335,13 +382,109 @@ "name": "stdout", "output_type": "stream", "text": [ - "0 [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n" + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + "[ 0. 0. 0. 1. 0. 0. 0. 0. 0. 
0.]\n" ] } ], "source": [ "import numpy as np\n", - "\n", + "x=np.zeros(10)\n", + "print(x)\n", + "x[3]=1\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. ..., 0. 0. 0.]\n" + ] + } + ], + "source": [ + "big=np.zeros(len(word_space))\n", + "print(big)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13828\n" + ] + } + ], + "source": [ + "w=\"love\"\n", + "word_index=word_space[w]\n", + "print(word_index)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "big[13828]=1" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[10, 20, 33, 44, 2, 6]\n" + ] + } + ], + "source": [ + "numbers=[10, 20, 33, 44, 50, 2, 6, 77]\n", + "less_than_fifty= [i for i in numbers if i < 50]\n", + "print(less_than_fifty)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]\n" + ] + } + ], + "source": [ "def get_sparse_vec(data_point, space):\n", " # create empty vector\n", " sparse_vec = np.zeros((len(space)))\n", @@ -349,7 +492,8 @@ " # use exception handling such that this function can also be used to vectorize \n", " # data with words not in train (i.e., test and dev data)\n", " try:\n", - " sparse_vec[space[w]]=1\n", + " word_index= space[w]\n", + " sparse_vec[word_index]=1\n", " except:\n", " continue\n", " return sparse_vec\n", @@ -360,9 +504,19 @@ "# Get class labels\n", "train_tags=[train_data[i].tag for i in range(len(train_data))]\n", "# Let's look at the last training data point\n", - "print train_tags[-1], train_vecs[-1][:10]" + "print(train_tags[-1])\n", + "print(train_vecs[-1][:10])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 83, From 71f94fced32bc1edb535fa4872d97dcddd9945e0 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Fri, 24 Mar 2017 10:05:18 -0700 Subject: [PATCH 07/11] Adding a unit on classes. 
--- classes.ipynb | 317 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 317 insertions(+) create mode 100644 classes.ipynb diff --git a/classes.ipynb b/classes.ipynb new file mode 100644 index 0000000..5440c5d --- /dev/null +++ b/classes.ipynb @@ -0,0 +1,317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A Quick Look at Python Classes" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "678678\n", + "Alex\n", + "['Python', 'Soical Media Intelligence']\n", + "Alex has uid 678678, and is taking: ['Python', 'Soical Media Intelligence']\n", + "\n", + "\n", + " A student class, holding name, id, and courses taken...\n", + " \n" + ] + } + ], + "source": [ + "class Student(object):\n", + " \"\"\"\n", + " A student class, holding name, id, and courses taken...\n", + " \"\"\"\n", + " def __init__(self, name, student_id, courses):\n", + " self.name=name\n", + " self.student_id = student_id\n", + " self.courses = courses\n", + "\n", + " def get_id(self):\n", + " return self.student_id\n", + " \n", + " def get_name(self):\n", + " return self.name\n", + "\n", + " def get_courses(self):\n", + " return self.courses\n", + "\n", + " def __str__(self):\n", + " return \"%s has uid %s, and is taking: %s\\n\" % (self.name, self.student_id, self.courses)\n", + "\n", + "#--------------------------------------------------------------------\n", + "alex=Student(\"Alex\", 678678, [\"Python\", \"Soical Media Intelligence\"])\n", + "print(alex.get_id())\n", + "print(alex.get_name())\n", + "print(alex.get_courses())\n", + "#----------\n", + "print(alex)\n", + "print(Student.__doc__)\n", + "#---------------------------------------------------------------------" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "7675456353\n", + "Sara\n", + "Deep Learning\n", + "Sara has uid 7675456353, and is taking: Deep Learning\n", + "\n" + ] + } + ], + "source": [ + "#--------------------------------------------------------------------\n", + "sara=Student(\"Sara\", 7675456353, \"Deep Learning\")\n", + "print(sara.get_id())\n", + "print(sara.get_name())\n", + "print(sara.get_courses())\n", + "#----------\n", + "#print(sara)\n", + "#---------------------------------------------------------------------" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Title of the Book: Deep Learning for NLP\n", + "- Price of the Book: 100\n", + "- New price of the Book: 180\n" + ] + } + ], + "source": [ + "class Book(object):\n", + " \"\"\"\n", + " A Book class with some getters! 
(Bad doc!)\n", + " \"\"\"\n", + " def __init__(self, title, b_id, price):\n", + " self.title = title\n", + " self.b_id = b_id\n", + " self.price = price\n", + "\n", + " def get_id(self):\n", + " return self.b_id\n", + " \n", + " def get_title(self):\n", + " return self.title\n", + "\n", + " def get_price(self):\n", + " return self.price\n", + " \n", + " def update_price(self, price):\n", + " self.price =price\n", + " \n", + "deep_learning=Book(\"Deep Learning for NLP\", \"888-22-33308\", 100)\n", + "\n", + "print('- Title of the Book: {}').format(deep_learning.get_title())\n", + "print('- Price of the Book: {}').format(deep_learning.get_price())\n", + "# Update the price\n", + "deep_learning.update_price(180)\n", + "print('- New price of the Book: {}').format(deep_learning.get_price())" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Title of the Book: Deep Learning for NLP\n", + "- Price of the Book: 100\n", + "- New price of the Book: 180\n" + ] + } + ], + "source": [ + "class Book(object):\n", + " \"\"\"\n", + " A Book class with some getters! 
(Bad doc!)\n", + " \"\"\"\n", + " def __init__(self, title, b_id, price):\n", + " self.title = title\n", + " self.b_id = b_id\n", + " self.price = price\n", + "\n", + "# def get_id(self):\n", + "# return self.b_id\n", + " \n", + "# def get_title(self):\n", + "# return self.title\n", + "\n", + "# def get_price(self):\n", + "# return self.price\n", + " \n", + " def update_price(self, new_price):\n", + " self.price = new_price\n", + " \n", + "deep_learning=Book(\"Deep Learning for NLP\", \"888-22-33308\", 100)\n", + "\n", + "print('- Title of the Book: {}').format(deep_learning.title)\n", + "print('- Price of the Book: {}').format(deep_learning.price)\n", + "# Update the price\n", + "deep_learning.update_price(180)\n", + "print('- New price of the Book: {}').format(deep_learning.price)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " A Book class with some getters! 
(Bad doc!)\n", + " \n" + ] + } + ], + "source": [ + "print(deep_learning.__doc__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Subclassing" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Issue of the Magazine: 12-2\n", + "- Title of the Magazine: Time\n", + "- Price of the Magazine: 10\n", + "- New price of the Magazine: 15\n" + ] + } + ], + "source": [ + "class Magazine(Book):\n", + " \"\"\"\n", + " Subclass of the Book class...\n", + " Only adds the method to get issue info.\n", + " \"\"\"\n", + " def __init__(self, title, b_id, price, issue):\n", + " # Just invoke the __init__ for the parent class\n", + " Book.__init__(self, title, b_id, price)\n", + " self.issue = issue\n", + "\n", + " def get_issue(self):\n", + " return self.issue\n", + " \n", + "time=Magazine(\"Time\", \"000-22-4444\", 10, \"12-2\")\n", + "\n", + "print('- Issue of the Magazine: {}').format(time.get_issue())\n", + "#-----------------------------------------------------------\n", + "# Everything else works like it should with the parent class\n", + "print('- Title of the Magazine: {}').format(time.get_title())\n", + "print('- Price of the Magazine: {}').format(time.get_price())\n", + "# Update the price\n", + "time.update_price(15)\n", + "print('- New price of the Magazine: {}').format(time.get_price())" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Subclass of the Book class...\n", + " Only adds the method to get issue info.\n", + " \n" + ] + } + ], + "source": [ + "# Note: Subclass does not inherit doc from parent class:\n", + "print(time.__doc__)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 1189f8366fae05a24b5a5b2cb3746f6455b0eec1 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Mon, 27 Mar 2017 15:19:30 -0700 Subject: [PATCH 08/11] Add files via upload Text classification with Naive Bayes, using sklearn... --- text_classification_Naive_Bayes.ipynb | 556 ++++++++++++++++++++++++++ 1 file changed, 556 insertions(+) create mode 100644 text_classification_Naive_Bayes.ipynb diff --git a/text_classification_Naive_Bayes.ipynb b/text_classification_Naive_Bayes.ipynb new file mode 100644 index 0000000..50ecb25 --- /dev/null +++ b/text_classification_Naive_Bayes.ipynb @@ -0,0 +1,556 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# A Vector Space Model, with scikit-learn Naive Bayes" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mam/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.\n", + " warnings.warn('Matplotlib is building the font cache using fc-list. 
This may take a moment.')\n" + ] + } + ], + "source": [ + "%matplotlib inline\n", + "import csv\n", + "import pandas\n", + "import sklearn\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.cross_validation import StratifiedKFold, cross_val_score " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50001\n" + ] + } + ], + "source": [ + "# Read the data\n", + "reviews = [line.rstrip() for line in open(\"/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata_2column.txt\")]\n", + "print(len(reviews))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\"label\",\"message\"\n", + "\"1\",\"bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as \"\" teachers \"\" . my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is \"\" teachers \"\" . the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line : inspector : i'm here to sack one of your teachers . student : welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . 
what a pity that it isn't !\"\n" + ] + } + ], + "source": [ + "# The data have a header and we print it\n", + "print(reviews[0])\n", + "# print first data point.\n", + "# data format is each review as a line, csv\n", + "# column one is the sentiment tag --> 1=positive sentiment, 0=negative sentiment\n", + "# column 2 is the review\n", + "print(reviews[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>label</th>\n", + " <th>message</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>\"label\"</td>\n", + " <td>\"message\"</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>\"1\"</td>\n", + " <td>\"bromwell high is a cartoon comedy . it ran at...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>\"1\"</td>\n", + " <td>\"homelessness ( or houselessness as george car...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>\"1\"</td>\n", + " <td>\"brilliant over-acting by lesley ann warren . ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>\"1\"</td>\n", + " <td>\"this is easily the most underrated film inn t...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " label message\n", + "0 \"label\" \"message\"\n", + "1 \"1\" \"bromwell high is a cartoon comedy . it ran at...\n", + "2 \"1\" \"homelessness ( or houselessness as george car...\n", + "3 \"1\" \"brilliant over-acting by lesley ann warren . ...\n", + "4 \"1\" \"this is easily the most underrated film inn t..." 
+ ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Let's actually read the file again with pandas\n", + "import csv\n", + "import pandas as pd\n", + "reviews = pd.read_csv(\"/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata_2column.txt\",\\\n", + " sep=',', quoting=csv.QUOTE_NONE, names=[\"label\", \"message\"])\n", + "\n", + "# Let's print a preview with the \"head\" command\n", + "reviews.head(n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mam/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=10.\n", + " % (min_labels, self.n_folds)), Warning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.82083583 0.8122 0.8234 0.8028 0.8204 0.834 0.8218\n", + " 0.8372 0.8034 0.8082 ]\n" + ] + } + ], + "source": [ + "reviews_data=reviews[\"message\"]\n", + "reviews_tags=reviews[\"label\"]\n", + "\n", + "pipeline = Pipeline([\n", + " ('bow', CountVectorizer(analyzer='word')), # get counts of tokens\n", + " ('tfidf', TfidfTransformer()), # get tf-idf scores\n", + " ('classifier', MultinomialNB()), # train on tf-idf vectors with the Naive Bayes classifier\n", + "])\n", + "\n", + "# Do 10-fold cross validation\n", + "scores = cross_val_score(pipeline, \n", + " reviews_data, \n", + " reviews_tags, \n", + " cv=10, \n", + " scoring='accuracy',\n", + " n_jobs=-1, # use all machine cores\n", + " )\n", + "print(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.818423583283\n" + ] + } + ], + "source": [ + "avg= 
sum(scores/10.0)\n", + "print(avg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Code fr " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'last_letter': 'x'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "def gender_features(word):\n", + " return {'last_letter': word[-1]}\n", + "gender_features('Alex')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'last_letter': 'e'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gender_features('Nicole')" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from nltk.corpus import names\n", + "labeled_names = ([(name, 'male') for name in names.words('male.txt')] +\\\n", + " [(name, 'female') for name in names.words('female.txt')])\n", + "import random\n", + "random.shuffle(labeled_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]\n", + "train_set, test_set = featuresets[500:], featuresets[:500]\n", + "classifier = nltk.NaiveBayesClassifier.train(train_set)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "({'last_letter': u'i'}, 'female')" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"featuresets[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[({'last_letter': u'i'}, 'female'),\n", + " ({'last_letter': u'b'}, 'male'),\n", + " ({'last_letter': u't'}, 'male'),\n", + " ({'last_letter': u'e'}, 'female'),\n", + " ({'last_letter': u'n'}, 'male'),\n", + " ({'last_letter': u'y'}, 'female'),\n", + " ({'last_letter': u'e'}, 'female'),\n", + " ({'last_letter': u'a'}, 'female'),\n", + " ({'last_letter': u'e'}, 'female'),\n", + " ({'last_letter': u'a'}, 'female')]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_set[0:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'female'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.classify(gender_features('Rebecca'))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'male'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.classify(gender_features('Jordon'))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'female'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classifier.classify(gender_features('Vivienne'))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.746\n" + ] + } + ], + "source": [ + "print(nltk.classify.accuracy(classifier, test_set))" + ] + }, + { + "cell_type": 
"code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most Informative Features\n", + " last_letter = u'a' female : male = 34.4 : 1.0\n", + " last_letter = u'k' male : female = 32.7 : 1.0\n", + " last_letter = u'f' male : female = 16.6 : 1.0\n", + " last_letter = u'p' male : female = 11.9 : 1.0\n", + " last_letter = u'v' male : female = 11.2 : 1.0\n" + ] + } + ], + "source": [ + "classifier.show_most_informative_features(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from nltk.classify import apply_features\n", + "train_set = apply_features(gender_features, labeled_names[500:])\n", + "test_set = apply_features(gender_features, labeled_names[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "nltk.util.LazyMap" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(train_set)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "({'last_letter': u'i'}, 'female')\n" + ] + } + ], + "source": [ + "print(train_set[0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 095f9cc74969fd94da20b594e158d1c61b7ac656 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Mon, 27 Mar 2017 
15:24:43 -0700 Subject: [PATCH 09/11] Add files via upload --- text_classification_Naive_Bayes.ipynb | 308 +------------------------- 1 file changed, 1 insertion(+), 307 deletions(-) diff --git a/text_classification_Naive_Bayes.ipynb b/text_classification_Naive_Bayes.ipynb index 50ecb25..461e416 100644 --- a/text_classification_Naive_Bayes.ipynb +++ b/text_classification_Naive_Bayes.ipynb @@ -220,316 +220,10 @@ } ], "source": [ + "# Let's get average accuracy...\n", "avg= sum(scores/10.0)\n", "print(avg)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Code fr " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'last_letter': 'x'}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import nltk\n", - "def gender_features(word):\n", - " return {'last_letter': word[-1]}\n", - "gender_features('Alex')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'last_letter': 'e'}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gender_features('Nicole')" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from nltk.corpus import names\n", - "labeled_names = ([(name, 'male') for name in names.words('male.txt')] +\\\n", - " [(name, 'female') for name in names.words('female.txt')])\n", - "import random\n", - "random.shuffle(labeled_names)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "featuresets = [(gender_features(n), gender) 
for (n, gender) in labeled_names]\n", - "train_set, test_set = featuresets[500:], featuresets[:500]\n", - "classifier = nltk.NaiveBayesClassifier.train(train_set)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "({'last_letter': u'i'}, 'female')" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "featuresets[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[({'last_letter': u'i'}, 'female'),\n", - " ({'last_letter': u'b'}, 'male'),\n", - " ({'last_letter': u't'}, 'male'),\n", - " ({'last_letter': u'e'}, 'female'),\n", - " ({'last_letter': u'n'}, 'male'),\n", - " ({'last_letter': u'y'}, 'female'),\n", - " ({'last_letter': u'e'}, 'female'),\n", - " ({'last_letter': u'a'}, 'female'),\n", - " ({'last_letter': u'e'}, 'female'),\n", - " ({'last_letter': u'a'}, 'female')]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_set[0:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'female'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "classifier.classify(gender_features('Rebecca'))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'male'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "classifier.classify(gender_features('Jordon'))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "'female'" - 
] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "classifier.classify(gender_features('Vivienne'))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.746\n" - ] - } - ], - "source": [ - "print(nltk.classify.accuracy(classifier, test_set))" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Most Informative Features\n", - " last_letter = u'a' female : male = 34.4 : 1.0\n", - " last_letter = u'k' male : female = 32.7 : 1.0\n", - " last_letter = u'f' male : female = 16.6 : 1.0\n", - " last_letter = u'p' male : female = 11.9 : 1.0\n", - " last_letter = u'v' male : female = 11.2 : 1.0\n" - ] - } - ], - "source": [ - "classifier.show_most_informative_features(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from nltk.classify import apply_features\n", - "train_set = apply_features(gender_features, labeled_names[500:])\n", - "test_set = apply_features(gender_features, labeled_names[:500])" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "nltk.util.LazyMap" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "type(train_set)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "({'last_letter': u'i'}, 'female')\n" - ] - } - ], - "source": [ - "print(train_set[0])" - ] } ], "metadata": { From c53817fbf1aeefd5da864ca0afda038398ff99b5 Mon Sep 17 00:00:00 2001 From: Muhammad 
Abdul-Mageed <mumageed@gmail.com> Date: Fri, 31 Mar 2017 12:35:34 -0700 Subject: [PATCH 10/11] Adding a first pandas tutorial... --- pandas_tutorial_1.ipynb | 727 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 727 insertions(+) create mode 100644 pandas_tutorial_1.ipynb diff --git a/pandas_tutorial_1.ipynb b/pandas_tutorial_1.ipynb new file mode 100644 index 0000000..8364d0e --- /dev/null +++ b/pandas_tutorial_1.ipynb @@ -0,0 +1,727 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Pandas has two core data structures: Series & DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Series" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2\n", + "1 4\n", + "2 6\n", + "3 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from pandas import Series, DataFrame\n", + "counts= Series([2, 4, 6, 8])\n", + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2 4 6 8]\n" + ] + } + ], + "source": [ + "print(counts.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Int64Index([0, 1, 2, 3], dtype='int64')\n" + ] + } + ], + "source": [ + "print(counts.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "c 6\n", + "d 8\n", + 
"dtype: int64\n" + ] + } + ], + "source": [ + "# We can create customized indexes:\n", + "counts= Series([2, 4, 6, 8], index=[\"a\", \"b\", \"c\", \"d\"])\n", + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], + "source": [ + "# We can use the indexes to access values:\n", + "# Note: We need to use quotes around an index:\n", + "print(counts[\"a\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "c 22\n", + "d 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# Reassign:\n", + "counts[\"c\"]=22\n", + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(counts[[\"a\", \"b\"]]) # Note the double square brackets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 7.389056e+00\n", + "b 5.459815e+01\n", + "c 3.584913e+09\n", + "d 2.980958e+03\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "# We can perform operations, similar to Numpy, while preserving the index values\n", + "print(np.exp(counts))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 0.6\n", + "b 1.2\n", + "c 6.6\n", + "d 2.4\n", + "dtype: float64\n" + ] + } + 
], + "source": [ + "print(counts*0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "c 22\n", + "d 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Alex 10\n", + "Evan 20\n", + "Gabi 15\n", + "John 12\n", + "Juan 20\n", + "Mary 13\n", + "Noha 9\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# We can create a Series from a Python dictionary:\n", + "d={\"Alex\": 10, \"John\": 12, \"Mary\": 13, \"Gabi\": 15, \"Noha\": 9,\\\n", + " \"Juan\": 20, \"Evan\": 20}\n", + "grades=Series(d)\n", + "print(grades)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Alex False\n", + "Evan True\n", + "Gabi True\n", + "John False\n", + "Juan True\n", + "Mary True\n", + "Noha False\n", + "dtype: bool\n" + ] + } + ], + "source": [ + "print(grades > 12)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Alex 12.0\n", + "Evan 24.0\n", + "Gabi 18.0\n", + "John 14.4\n", + "Juan 24.0\n", + "Mary 15.6\n", + "Noha 10.8\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "raised= grades * 1.2\n", + "print(raised)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age courses names\n", + "0 25 Python Alex\n", + "1 27 Perl John\n", + "2 32 Deep 
Learning Mary\n", + "3 19 Pattern Recognition Gabi\n", + "4 23 Data Mining Noha\n", + "5 20 Computational Archives Juan\n", + "6 21 Health Informatics Evan\n" + ] + } + ], + "source": [ + "# The DataFrame is a (possibly heterogeneous) spreadsheet-like (think Excel) data structure\n", + "# that enables both row and column indexing. Intuitively, we can think about a DataFrame as \n", + "# a dict of Series\n", + "\n", + "data= {\"courses\": [\"Python\", \"Perl\", \"Deep Learning\", \"Pattern Recognition\", \"Data Mining\",\\\n", + " \"Computational Archives\", \"Health Informatics\"],\n", + " \"age\": [25, 27, 32, 19, 23, 20, 21],\n", + " \"names\": [\"Alex\", \"John\", \"Mary\", \"Gabi\", \"Noha\", \"Juan\", \"Evan\"]}\n", + "\n", + " \n", + "frame=DataFrame(data)\n", + "print(frame)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age courses names\n", + "s1 25 Python Alex\n", + "s2 27 Perl John\n", + "s3 32 Deep Learning Mary\n", + "s4 19 Pattern Recognition Gabi\n", + "s5 23 Data Mining Noha\n", + "s6 20 Computational Archives Juan\n", + "s7 21 Health Informatics Evan\n" + ] + } + ], + "source": [ + "frame=DataFrame(data, index=[\"s1\", \"s2\", \"s3\", \"s4\", \"s5\", \"s6\", \"s7\" ])\n", + "print(frame)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s1 Alex\n", + "s2 John\n", + "s3 Mary\n", + "s4 Gabi\n", + "s5 Noha\n", + "s6 
Juan\n", + "s7 Evan\n", + "Name: names, dtype: object\n" + ] + } + ], + "source": [ + "print(frame.names)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "age 25\n", + "courses Python\n", + "names Alex\n", + "Name: s1, dtype: object\n" + ] + } + ], + "source": [ + "# Rows can be retrieved by e.g., the \"ix\" indexing field:\n", + "print(frame.ix[\"s1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "age 27\n", + "courses Perl\n", + "names John\n", + "Name: s2, dtype: object\n" + ] + } + ], + "source": [ + "print(frame.ix[\"s2\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age courses names School\n", + "s1 25 Python Alex UBC\n", + "s2 27 Perl John UBC\n", + "s3 32 Deep Learning Mary UBC\n", + "s4 19 Pattern Recognition Gabi UBC\n", + "s5 23 Data Mining Noha UBC\n", + "s6 20 Computational Archives Juan UBC\n", + "s7 21 Health Informatics Evan UBC\n" + ] + } + ], + "source": [ + "# add a column\n", + "frame[\"School\"]=\"UBC\"\n", + "print(frame)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index([u'age', u'courses', u'names', u'School'], dtype='object')\n" + ] + } + ], + "source": [ + "print(frame.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index([u's1', u's2', u's3', u's4', u's5', u's6', u's7'], dtype='object')\n" + ] + } + ], + "source": [ + "print(frame.index)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "amazement 544395\n", + "loathing 74923\n", + "admiration 65759\n", + "grief 42947\n", + "terror 35705\n", + "ecstasy 30206\n", + "rage 8738\n", + "vigilance 695\n", + "Name: label, dtype: int64\n" + ] + } + ], + "source": [ + "import statsmodels.api as sm\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from patsy import dmatrices\n", + "from random import shuffle, randint, sample\n", + "import seaborn as sns\n", + "import numpy as np\n", + "%matplotlib inline\n", + "\n", + "emotion = pd.read_csv('emotions_p1_extended_lang_id_noduplic_denoised.csv', delimiter=',', header=0)\n", + "#----------------------------------------\n", + "print(pd.value_counts(emotion[\"label\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>message_id</th>\n", + " <th>message</th>\n", + " <th>label</th>\n", + " <th>lang_id</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>596908062054555648</td>\n", + " <td>Last week , Yuki Kawauchi ran 3 HM in 3 consec...</td>\n", + " <td>admiration</td>\n", + " <td>en</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>257202468386115584</td>\n", + " <td>Had a Turkish bath today . 
#amazing</td>\n", + " <td>amazement</td>\n", + " <td>en</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>223865330487930880</td>\n", + " <td>Taking my 6yo niece shopping #imintrouble #goi...</td>\n", + " <td>ecstasy</td>\n", + " <td>nl</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>411617825149566976</td>\n", + " <td><USER> <USER> <USER> Britt and I tried for so ...</td>\n", + " <td>grief</td>\n", + " <td>en</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>267380735453835264</td>\n", + " <td>I love this new song of one direction gotta ad...</td>\n", + " <td>amazement</td>\n", + " <td>en</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " message_id message \\\n", + "0 596908062054555648 Last week , Yuki Kawauchi ran 3 HM in 3 consec... \n", + "1 257202468386115584 Had a Turkish bath today . #amazing \n", + "2 223865330487930880 Taking my 6yo niece shopping #imintrouble #goi... \n", + "3 411617825149566976 <USER> <USER> <USER> Britt and I tried for so ... \n", + "4 267380735453835264 I love this new song of one direction gotta ad... 
\n", + "\n", + " label lang_id \n", + "0 admiration en \n", + "1 amazement en \n", + "2 ecstasy nl \n", + "3 grief en \n", + "4 amazement en " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emotion.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c42eb163f18d8bf88f806262afcdfb58636e1021 Mon Sep 17 00:00:00 2001 From: Muhammad Abdul-Mageed <mumageed@gmail.com> Date: Fri, 31 Mar 2017 12:37:59 -0700 Subject: [PATCH 11/11] Updating numpy tutorial. --- python_tutorial_part_4_numpy.ipynb | 540 ++++++++++++++++++++++++++--- 1 file changed, 500 insertions(+), 40 deletions(-) diff --git a/python_tutorial_part_4_numpy.ipynb b/python_tutorial_part_4_numpy.ipynb index ad848ce..84940f2 100644 --- a/python_tutorial_part_4_numpy.ipynb +++ b/python_tutorial_part_4_numpy.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 64, "metadata": { "collapsed": false }, @@ -20,7 +20,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "<type 'numpy.ndarray'>\n", + " <type 'numpy.ndarray'>\n", "a --> [2 3 4 5]\n", "b --> [5 6 7 8]\n", "a+b --> [ 7 9 11 13]\n" @@ -28,20 +28,304 @@ } ], "source": [ - "from numpy import *\n", - "#from numpy import array\n", + "# Import numpy, conventionally as \"np\"\n", "import numpy as np\n", - "a= array([2,3,4,5])\n", - "b=array((5,6,7,8))\n", + "# Numpy enables creation of N-dimensional arrays of data, or ndarrays\n", + "a=np.array([2,3,4,5])\n", + "b=np.array((5,6,7,8))\n", "print type(a)\n", "print \"a -->\", a\n", "print \"b -->\", b\n", - "print \"a+b 
-->\", a+b\n" + "print \"a+b -->\", a+b" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4,)\n" + ] + } + ], + "source": [ + "# We can get the shape of the array, which is a tuple of the sizes of its dimensions\n", + "print(a.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n" + ] + } + ], + "source": [ + "z=np.zeros(10)\n", + "print(z)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(10,)\n" + ] + } + ], + "source": [ + "print(z.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + } + ], + "source": [ + "print(z.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 
0.]]\n" + ] + } + ], + "source": [ + "# If we want a 2*5 ndarray, we can initialize it with \"zeros\" or \"ones\":\n", + "x=np.zeros([2, 5])\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2, 5)\n", + "2\n" + ] + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n" + ] + } + ], + "source": [ + "# Or we can initialize with a shape of 4, 9:\n", + "x=np.zeros([4, 9])\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4, 9)\n", + "2\n" + ] + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 1.]]\n", + "\n", + " [[ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 
1.]]]\n" + ] + } + ], + "source": [ + "# We can also create an array of > 2 dimensions\n", + "# Consider the following from the documentation of scipy: https://docs.scipy.org/doc/numpy-dev/user/quickstart.html:\n", + "\"\"\"\n", + "When you print an array, NumPy displays it in a similar way to nested lists, but with the following layout:\n", + "\n", + " the last axis is printed from left to right,\n", + " the second-to-last is printed from top to bottom,\n", + " the rest are also printed from top to bottom, with each slice separated from the next by an empty line.\n", + "\n", + "One-dimensional arrays are then printed as rows, bidimensionals as matrices and tridimensionals as lists of matrices.\n", + "\"\"\"\n", + "x=np.ones([2, 4, 3])\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2, 4, 3)\n", + "3\n" + ] + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]]\n", + "\n", + "\n", + " [[[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 
0.]]]]\n" + ] + } + ], + "source": [ + "x=np.zeros([2, 3, 4, 5])\n", + "print(x)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 40, "metadata": { "collapsed": false }, @@ -50,32 +334,168 @@ "name": "stdout", "output_type": "stream", "text": [ - "This will give an error!!!\n", - "a+c -->" + "(2, 3, 4, 5)\n", + "4\n" ] - }, + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "ename": "ValueError", - "evalue": "operands could not be broadcast together with shapes (4,) (6,) ", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-4-5f9c99476f2e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m9\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"This will give an error!!!\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"a+c -->\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mValueError\u001b[0m: operands could not be broadcast together with shapes (4,) (6,) " + "name": "stdout", + "output_type": "stream", + "text": [ + "float64\n" + ] + } + ], + "source": [ + "print(x.dtype)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]]\n", + "\n", + " [[1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]]]\n", + "int32\n" + ] + } + ], + "source": [ + "# Note array data type...\n", + "x=np.ones([2, 4, 3], dtype=np.int32)\n", + "print(x)\n", + "print(x.dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Operations on arrays:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]]\n", + "\n", + " [[5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]]]\n" ] } ], + "source": [ + "print(x*5)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]]\n", + "\n", + " [[ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]]]\n" + ] + } + ], + "source": [ + "print(x/5.0)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]]]\n" + ] + } + ], + "source": [ + "# We can slice\n", + "my_slice=x[1:2]\n", + "print(my_slice)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], "source": [ "# You can only add arrays of the same shape / equal length:\n", - "c=array([5,8,8,9,5,2])\n", - "print \"This will give an error!!!\"\n", - "print \"a+c -->\", a+c" + 
"c=np.array([5,8,8,9,5,2])\n", + "# print \"This will give an error if you print it!!!\"\n", + "# print \"a+c -->\", a+c" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 51, "metadata": { "collapsed": false }, @@ -84,20 +504,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "a+1 --> [3 4 5 6]\n" + "a --> [2 3 4 5]\n", + "a+2 --> [4 5 6 7]\n" ] } ], "source": [ "# broadcasting\n", "# If you add an array to a scalar, the scalar gets broadcast across all the array elements\n", - "print \"a+1 -->\", a+1\n", + "print \"a -->\", a\n", + "print \"a+2 -->\", a+2\n", "# Now you can broadcast arrays and so you can add arrays of different shapes..." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 52, "metadata": { "collapsed": false }, @@ -116,16 +538,22 @@ } ], "source": [ - "import numpy as np\n", "x= np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.float32)\n", "print \"Printing array x: \", x,\"\\n\"\n", "print \"\\\"Shape of array x is:\\\" \", x.shape,\"\\n\"\n", "print \"\\\"Value at x[0][1] is:\\\" \", x[0][1] # gives row0, c1 --> we start index from zero!" 
] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More operations" + ] + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 53, "metadata": { "collapsed": false }, @@ -149,7 +577,43 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1 6 15 6]\n" + ] + } + ], + "source": [ + "print(x*y)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# You cannot do the below:\n", + "# You will get an error:\n", + "# ValueError: operands could not be broadcast together with shapes (4,) (5,)\n", + "x=np.array([1, 3, 5, 6])\n", + "y=np.array([1,2,3,1, 9])\n", + "d=y[1:]-y[:-1]\n", + "print(x*y)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, "metadata": { "collapsed": false }, @@ -166,12 +630,12 @@ "source": [ "print sum(a)\n", "# cumsum adds every emelement to the previous element\n", - "print cumsum(a)" + "print np.cumsum(a)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 61, "metadata": { "collapsed": false }, @@ -186,14 +650,12 @@ "-------------------\n", "[2 3 4 5 6]\n", "-------------------\n", - "[2 4 6]\n", - "-------------------\n", - "[ 100. 215.443469 464.15888336 1000. 
]\n" + "[ 2 7 12 17 22 27 32 37 42 47]\n", + "-------------------\n" ] } ], "source": [ - "import numpy as np\n", "#numpy.arange: http://docs.scipy.org/doc/numpy/reference/generated/numpy.arange.html\n", "\"\"\"\n", "numpy.arange([start, ]stop, [step, ]dtype=None)\n", @@ -208,7 +670,7 @@ "print \"-------------------\"\n", "print np.arange(2,7)\n", "print \"-------------------\"\n", - "print np.arange(2,7, 2)\n", + "print np.arange(2,50, 5)\n", "print \"-------------------\"" ] }, @@ -255,7 +717,6 @@ } ], "source": [ - "import numpy as np\n", "#------------------\n", "print \"numpy.zeros\"\n", "#------------------\n", @@ -336,12 +797,11 @@ } ], "source": [ - "import numpy as np\n", "#------------------\n", "print \"\\n numpy.linspace\"\n", "#------------------\n", "\"\"\"\n", - " numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None)[source]¶\n", + " numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None)[source]\n", " Return evenly spaced numbers over a specified interval.\n", " Returns num evenly spaced samples, calculated over the interval [start, stop].\n", " The endpoint of the interval can optionally be excluded.\n", @@ -392,7 +852,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.12" } }, "nbformat": 4,