diff --git a/README.md b/README.md index a97e86f..44fea55 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,18 @@ # A Python Tutorial: -This is code I wrote for courses I teach at Indiana University. -The first parts of the code in this tutorial are meant for Python beginners, and the code grows more advanced as you advance through the later parts. +This is code I wrote for courses I taught at Indiana University and then University of British Columbia. +The first parts of the code in this tutorial are meant for Python beginners, and the code grows more advanced in later parts. -In the context of this tutorial, I plan to include sections covering the Natural Language Toolkit (NLTK), gensim, scikit-learn, visualization, numpy, etc. +In the context of this tutorial, I have added sections covering processing text, use of the Natural Language Toolkit (NLTK), gensim, scikit-learn. I plan to add parts on visualization, numpy, etc. In addition, I plan to add more advanced code covering practical machine learning issues like vector space models to perform certain tasks like sentiment analysis. Finally, I also plan to introduce some deep learning tools and provide some relevant code. -The courses teach skills for at the intersection of fields like natural language processing, machine learning, social media mining, text mining, data science, etc. +The courses teach data science skills (i.e., skills at the intersection of natural language processing, applied machine learning, and social media mining). The code is written primarily in Python 2.7. A migration to Python 3 shoul be straightforward. + Some of the code is written and run during class sessions and so it is shared without much polishing. +In some places, you may find some repetition (primarily for pedagogical purposes inside class). I provide some comments, before I push here, as much as I can. 
diff --git a/classes.ipynb b/classes.ipynb new file mode 100644 index 0000000..5440c5d --- /dev/null +++ b/classes.ipynb @@ -0,0 +1,317 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A Quick Look at Python Classes" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "678678\n", + "Alex\n", + "['Python', 'Soical Media Intelligence']\n", + "Alex has uid 678678, and is taking: ['Python', 'Soical Media Intelligence']\n", + "\n", + "\n", + " A student class, holding name, id, and courses taken...\n", + " \n" + ] + } + ], + "source": [ + "class Student(object):\n", + " \"\"\"\n", + " A student class, holding name, id, and courses taken...\n", + " \"\"\"\n", + " def __init__(self, name, student_id, courses):\n", + " self.name=name\n", + " self.student_id = student_id\n", + " self.courses = courses\n", + "\n", + " def get_id(self):\n", + " return self.student_id\n", + " \n", + " def get_name(self):\n", + " return self.name\n", + "\n", + " def get_courses(self):\n", + " return self.courses\n", + "\n", + " def __str__(self):\n", + " return \"%s has uid %s, and is taking: %s\\n\" % (self.name, self.student_id, self.courses)\n", + "\n", + "#--------------------------------------------------------------------\n", + "alex=Student(\"Alex\", 678678, [\"Python\", \"Soical Media Intelligence\"])\n", + "print(alex.get_id())\n", + "print(alex.get_name())\n", + "print(alex.get_courses())\n", + "#----------\n", + "print(alex)\n", + "print(Student.__doc__)\n", + "#---------------------------------------------------------------------" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7675456353\n", + "Sara\n", + "Deep Learning\n", + "Sara has uid 7675456353, and is taking: Deep 
Learning\n", + "\n" + ] + } + ], + "source": [ + "#--------------------------------------------------------------------\n", + "sara=Student(\"Sara\", 7675456353, \"Deep Learning\")\n", + "print(sara.get_id())\n", + "print(sara.get_name())\n", + "print(sara.get_courses())\n", + "#----------\n", + "#print(sara)\n", + "#---------------------------------------------------------------------" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Title of the Book: Deep Learning for NLP\n", + "- Price of the Book: 100\n", + "- New price of the Book: 180\n" + ] + } + ], + "source": [ + "class Book(object):\n", + " \"\"\"\n", + " A Book class with some getters! (Bad doc!)\n", + " \"\"\"\n", + " def __init__(self, title, b_id, price):\n", + " self.title = title\n", + " self.b_id = b_id\n", + " self.price = price\n", + "\n", + " def get_id(self):\n", + " return self.b_id\n", + " \n", + " def get_title(self):\n", + " return self.title\n", + "\n", + " def get_price(self):\n", + " return self.price\n", + " \n", + " def update_price(self, price):\n", + " self.price =price\n", + " \n", + "deep_learning=Book(\"Deep Learning for NLP\", \"888-22-33308\", 100)\n", + "\n", + "print('- Title of the Book: {}').format(deep_learning.get_title())\n", + "print('- Price of the Book: {}').format(deep_learning.get_price())\n", + "# Update the price\n", + "deep_learning.update_price(180)\n", + "print('- New price of the Book: {}').format(deep_learning.get_price())" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Title of the Book: Deep Learning for NLP\n", + "- Price of the Book: 100\n", + "- New 
price of the Book: 180\n" + ] + } + ], + "source": [ + "class Book(object):\n", + " \"\"\"\n", + " A Book class with some getters! (Bad doc!)\n", + " \"\"\"\n", + " def __init__(self, title, b_id, price):\n", + " self.title = title\n", + " self.b_id = b_id\n", + " self.price = price\n", + "\n", + "# def get_id(self):\n", + "# return self.b_id\n", + " \n", + "# def get_title(self):\n", + "# return self.title\n", + "\n", + "# def get_price(self):\n", + "# return self.price\n", + " \n", + " def update_price(self, new_price):\n", + " self.price = new_price\n", + " \n", + "deep_learning=Book(\"Deep Learning for NLP\", \"888-22-33308\", 100)\n", + "\n", + "print('- Title of the Book: {}').format(deep_learning.title)\n", + "print('- Price of the Book: {}').format(deep_learning.price)\n", + "# Update the price\n", + "deep_learning.update_price(180)\n", + "print('- New price of the Book: {}').format(deep_learning.price)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " A Book class with some getters! 
(Bad doc!)\n", + " \n" + ] + } + ], + "source": [ + "print(deep_learning.__doc__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Subclassing" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Issue of the Magazine: 12-2\n", + "- Title of the Magazine: Time\n", + "- Price of the Magazine: 10\n", + "- New price of the Magazine: 15\n" + ] + } + ], + "source": [ + "class Magazine(Book):\n", + " \"\"\"\n", + " Subclass of the Book class...\n", + " Only adds the method to get issue info.\n", + " \"\"\"\n", + " def __init__(self, title, b_id, price, issue):\n", + " # Just invoke the __init__ for the parent class\n", + " Book.__init__(self, title, b_id, price)\n", + " self.issue = issue\n", + "\n", + " def get_issue(self):\n", + " return self.issue\n", + " \n", + "time=Magazine(\"Time\", \"000-22-4444\", 10, \"12-2\")\n", + "\n", + "print('- Issue of the Magazine: {}').format(time.get_issue())\n", + "#-----------------------------------------------------------\n", + "# Everything else works like it should with the parent class\n", + "print('- Title of the Magazine: {}').format(time.get_title())\n", + "print('- Price of the Magazine: {}').format(time.get_price())\n", + "# Update the price\n", + "time.update_price(15)\n", + "print('- New price of the Magazine: {}').format(time.get_price())" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " Subclass of the Book class...\n", + " Only adds the method to get issue info.\n", + " \n" + ] + } + ], + "source": [ + "# Note: Subclass does not inherit doc from parent class:\n", + "print(time.__doc__)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/pandas_tutorial_1.ipynb b/pandas_tutorial_1.ipynb new file mode 100644 index 0000000..8364d0e --- /dev/null +++ b/pandas_tutorial_1.ipynb @@ -0,0 +1,727 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pandas:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Pandas has two core data structures: Series & DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Series" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 2\n", + "1 4\n", + "2 6\n", + "3 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from pandas import Series, DataFrame\n", + "counts= Series([2, 4, 6, 8])\n", + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2 4 6 8]\n" + ] + } + ], + "source": [ + "print(counts.values)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Int64Index([0, 1, 2, 3], dtype='int64')\n" + ] + } + ], + "source": [ + "print(counts.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "c 
6\n", + "d 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# We can create customized indexes:\n", + "counts= Series([2, 4, 6, 8], index=[\"a\", \"b\", \"c\", \"d\"])\n", + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], + "source": [ + "# We can use the indexes to access values:\n", + "# Note: We need to use quotes around an index:\n", + "print(counts[\"a\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "c 22\n", + "d 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# Reassign:\n", + "counts[\"c\"]=22\n", + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(counts[[\"a\", \"b\"]]) # Note the double square brackets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 7.389056e+00\n", + "b 5.459815e+01\n", + "c 3.584913e+09\n", + "d 2.980958e+03\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "# We can perform operations, similar to Numpy, while preserving the index values\n", + "print(np.exp(counts))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 0.6\n", + "b 1.2\n", + "c 6.6\n", + "d 2.4\n", + "dtype: 
float64\n" + ] + } + ], + "source": [ + "print(counts*0.3)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a 2\n", + "b 4\n", + "c 22\n", + "d 8\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Alex 10\n", + "Evan 20\n", + "Gabi 15\n", + "John 12\n", + "Juan 20\n", + "Mary 13\n", + "Noha 9\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# We can create a Series from a Python dictionary:\n", + "d={\"Alex\": 10, \"John\": 12, \"Mary\": 13, \"Gabi\": 15, \"Noha\": 9,\\\n", + " \"Juan\": 20, \"Evan\": 20}\n", + "grades=Series(d)\n", + "print(grades)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Alex False\n", + "Evan True\n", + "Gabi True\n", + "John False\n", + "Juan True\n", + "Mary True\n", + "Noha False\n", + "dtype: bool\n" + ] + } + ], + "source": [ + "print(grades > 12)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Alex 12.0\n", + "Evan 24.0\n", + "Gabi 18.0\n", + "John 14.4\n", + "Juan 24.0\n", + "Mary 15.6\n", + "Noha 10.8\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "raised= grades * 1.2\n", + "print(raised)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age courses names\n", + "0 25 Python Alex\n", + "1 27 Perl 
John\n", + "2 32 Deep Learning Mary\n", + "3 19 Pattern Recognition Gabi\n", + "4 23 Data Mining Noha\n", + "5 20 Computational Archives Juan\n", + "6 21 Health Informatics Evan\n" + ] + } + ], + "source": [ + "# The DataFrame is a (possibly heterogeneous) spreadsheet-like (think Excel) data structure\n", + "# that enables both row and column indexing. Intutively, we can think about a DataFrame as \n", + "# a dict of Series\n", + "\n", + "data= {\"courses\": [\"Python\", \"Perl\", \"Deep Learning\", \"Pattern Recognition\", \"Data Mining\",\\\n", + " \"Computational Archives\", \"Health Informatics\"],\n", + " \"age\": [25, 27, 32, 19, 23, 20, 21],\n", + " \"names\": [\"Alex\", \"John\", \"Mary\", \"Gabi\", \"Noha\", \"Juan\", \"Evan\"]}\n", + "\n", + " \n", + "frame=DataFrame(data)\n", + "print(frame)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age courses names\n", + "s1 25 Python Alex\n", + "s2 27 Perl John\n", + "s3 32 Deep Learning Mary\n", + "s4 19 Pattern Recognition Gabi\n", + "s5 23 Data Mining Noha\n", + "s6 20 Computational Archives Juan\n", + "s7 21 Health Informatics Evan\n" + ] + } + ], + "source": [ + "frame=DataFrame(data, index=[\"s1\", \"s2\", \"s3\", \"s4\", \"s5\", \"s6\", \"s7\" ])\n", + "print(frame)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s1 Alex\n", + "s2 John\n", + "s3 Mary\n", + "s4 Gabi\n", + "s5 Noha\n", + "s6 Juan\n", + "s7 Evan\n", + "Name: names, dtype: object\n" + ] + } + ], + "source": [ + "print(frame[\"names\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s1 Alex\n", + "s2 John\n", + "s3 Mary\n", + "s4 Gabi\n", 
+ "s5 Noha\n", + "s6 Juan\n", + "s7 Evan\n", + "Name: names, dtype: object\n" + ] + } + ], + "source": [ + "print(frame.names)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "age 25\n", + "courses Python\n", + "names Alex\n", + "Name: s1, dtype: object\n" + ] + } + ], + "source": [ + "# Rows can be retrieved by e.g., the \"ix\" indexing field:\n", + "print(frame.ix[\"s1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "age 27\n", + "courses Perl\n", + "names John\n", + "Name: s2, dtype: object\n" + ] + } + ], + "source": [ + "print(frame.ix[\"s2\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " age courses names School\n", + "s1 25 Python Alex UBC\n", + "s2 27 Perl John UBC\n", + "s3 32 Deep Learning Mary UBC\n", + "s4 19 Pattern Recognition Gabi UBC\n", + "s5 23 Data Mining Noha UBC\n", + "s6 20 Computational Archives Juan UBC\n", + "s7 21 Health Informatics Evan UBC\n" + ] + } + ], + "source": [ + "# add a coulmn\n", + "frame[\"School\"]=\"UBC\"\n", + "print(frame)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index([u'age', u'courses', u'names', u'School'], dtype='object')\n" + ] + } + ], + "source": [ + "print(frame.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index([u's1', u's2', u's3', u's4', u's5', u's6', u's7'], dtype='object')\n" + ] + } + ], + "source": [ + 
"print(frame.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "amazement 544395\n", + "loathing 74923\n", + "admiration 65759\n", + "grief 42947\n", + "terror 35705\n", + "ecstasy 30206\n", + "rage 8738\n", + "vigilance 695\n", + "Name: label, dtype: int64\n" + ] + } + ], + "source": [ + "import statsmodels.api as sm\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from patsy import dmatrices\n", + "from random import shuffle, randint, sample\n", + "import seaborn as sns\n", + "import numpy as np\n", + "%matplotlib inline\n", + "\n", + "emotion = pd.read_csv('emotions_p1_extended_lang_id_noduplic_denoised.csv', delimiter=',', header=0)\n", + "#----------------------------------------\n", + "print(pd.value_counts(emotion[\"label\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
message_idmessagelabellang_id
0596908062054555648Last week , Yuki Kawauchi ran 3 HM in 3 consec...admirationen
1257202468386115584Had a Turkish bath today . #amazingamazementen
2223865330487930880Taking my 6yo niece shopping #imintrouble #goi...ecstasynl
3411617825149566976<USER> <USER> <USER> Britt and I tried for so ...griefen
4267380735453835264I love this new song of one direction gotta ad...amazementen
\n", + "
" + ], + "text/plain": [ + " message_id message \\\n", + "0 596908062054555648 Last week , Yuki Kawauchi ran 3 HM in 3 consec... \n", + "1 257202468386115584 Had a Turkish bath today . #amazing \n", + "2 223865330487930880 Taking my 6yo niece shopping #imintrouble #goi... \n", + "3 411617825149566976 Britt and I tried for so ... \n", + "4 267380735453835264 I love this new song of one direction gotta ad... \n", + "\n", + " label lang_id \n", + "0 admiration en \n", + "1 amazement en \n", + "2 ecstasy nl \n", + "3 grief en \n", + "4 amazement en " + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "emotion.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/processing_raw_text.ipynb b/processing_raw_text.ipynb new file mode 100644 index 0000000..1bb913c --- /dev/null +++ b/processing_raw_text.ipynb @@ -0,0 +1,1010 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Processing Raw Text" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Also see: \n", + "## http://www.nltk.org/book/ch03.html, https://docs.python.org/2/howto/urllib2.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a book from Project Gutenberg with Python:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of 'response' is :\n", + "Type of 'raw' is :\n" + 
] + } + ], + "source": [ + "from urllib2 import Request, urlopen\n", + "\n", + "url=\"http://www.gutenberg.org/files/54255/54255-0.txt\"\n", + "response = urlopen(url)\n", + "raw = response.read().decode('utf8')\n", + "#--------------------------------------------------\n", + "# Check types...\n", + "print(\"Type of \\'response\\' is %s:\")% type(response)\n", + "print(\"Type of \\'raw\\' is %s:\")% type(raw)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Project Gutenberg EBook of Narrative of Travels in Europe, Asia, and\r\n", + "Africa, in the Seventeenth Century, Volum, by Evliya Çelebi and Joseph Hammer-Purgstall\r\n", + "\r\n" + ] + } + ], + "source": [ + "print(raw[:165])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['hey', ',', 'guys', ',', 'how', 'is', 'life', '?', '?', '?', '!']\n" + ] + } + ], + "source": [ + "from nltk import word_tokenize\n", + "t=\"hey, guys, how is life???!\"\n", + "tt =word_tokenize(t)\n", + "print(tt)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('hey', 'NN'), (',', ','), ('guys', 'NNS'), (',', ','), ('how', 'WRB'), ('is', 'VBZ'), ('life', 'NN'), ('?', '.'), ('?', '.'), ('?', '.'), ('!', '.')]\n" + ] + } + ], + "source": [ + "ttt = pos_tag(tt)\n", + "print(ttt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenize and pos-tag the text:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "144822\n", + "144822\n" + ] + } + ], + "source": [ + "from nltk 
import word_tokenize, pos_tag\n", + "#------------------------------\n", + "tokens = word_tokenize(raw)\n", + "print(len(tokens))\n", + "tagged=pos_tag(tokens)\n", + "print(len(tagged))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[u'\\ufeffThe', u'Project', u'Gutenberg', u'EBook', u'of', u'Narrative', u'of', u'Travels', u'in', u'Europe', u',', u'Asia', u',', u'and', u'Africa', u',', u'in', u'the', u'Seventeenth', u'Century', u',', u'Volum', u',', u'by', u'Evliya', u'\\xc7elebi', u'and', u'Joseph', u'Hammer-Purgstall', u'This', u'eBook', u'is', u'for', u'the', u'use', u'of', u'anyone', u'anywhere', u'in', u'the', u'United', u'States', u'and', u'most', u'other', u'parts', u'of', u'the', u'world', u'at']\n" + ] + } + ], + "source": [ + "print(tokens[:50]) # list of unicode items" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(u'\\ufeffThe', 'NN'), (u'Project', 'NNP'), (u'Gutenberg', 'NNP'), (u'EBook', 'NNP'), (u'of', 'IN'), (u'Narrative', 'NNP'), (u'of', 'IN'), (u'Travels', 'NNP'), (u'in', 'IN'), (u'Europe', 'NNP')]\n" + ] + } + ], + "source": [ + "print(tagged[:10]) # list of tuples (word,pos_tag pairs)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['hello', 'hi']\n" + ] + } + ], + "source": [ + "wds=[\"hello\", \"hi\", \"life\"]\n", + "h_wds= [w for w in wds if w.startswith(\"h\")]\n", + "\n", + "\n", + "new_words=[]\n", + "for w in wds:\n", + " if w.startswith(\"h\"):\n", + " new_words.append(w)\n", + "print(new_words)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": 
[ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['plays']\n" + ] + } + ], + "source": [ + "pairs=[ (\"Alex\", \"NN\"), (\"plays\", \"VBZ\") ]\n", + "verbs=[ x[0] for x in pairs if x[1]==\"VBZ\"]\n", + "print(verbs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Note: The pos tagger of course makes mistakes, but it performs reasonably well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List comprehension on \"tagged\"" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project\n", + "Gutenberg\n", + "EBook\n", + "Narrative\n", + "Travels\n", + "Europe\n", + "Asia\n", + "Africa\n", + "Seventeenth\n", + "Century\n", + "Volum\n", + "Evliya\n", + "Çelebi\n", + "Joseph\n", + "Hammer-Purgstall\n", + "United\n", + "Project\n", + "Gutenberg\n", + "License\n", + "United\n", + "Europe\n", + "Asia\n", + "Africa\n", + "Seventeenth\n", + "Century\n", + "II\n", + "Evliya\n", + "Çelebi\n", + "Evliya\n", + "Çelebi\n", + "Joseph\n", + "Hammer-Purgstall\n", + "Release\n", + "Date\n", + "February\n", + "[\n", + "EBook\n", + "Character\n", + "***\n", + "START\n", + "THIS\n", + "PROJECT\n", + "GUTENBERG\n", + "EBOOK\n", + "NARRATIVE\n", + "OF\n", + "TRAVELS\n", + "***\n", + "Produced\n", + "Turgut\n" + ] + } + ], + "source": [ + "# Named enitities:\n", + "ne=[pair[0] for pair in tagged if pair[-1]==\"NNP\"]\n", + "for e in ne[:50]:\n", + " print(e)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "breadth\n", + "remarkable\n", + "ruby-coloured\n", + "particular\n", + "tombs\n", + "gun-shot’s\n", + "yellow\n", + "rapid\n", + "mild\n", + "mile\n", + "sleep\n", + "legal\n", + "forty-six\n", + "Elephant\n", + "dish\n", + "follow\n", + 
"abundant\n", + "religious\n", + "washing-tubs\n", + "dreadful\n", + "seventy-seven\n", + "pardon\n", + "hunting\n", + "swam\n", + "outdated\n", + "becas\n", + "mosque\n", + "young\n", + "“Mevlúd-námeh\n", + "underwent\n", + "answered\n", + "tail\n", + "foster\n", + "obstinate\n", + "stable\n", + "suite\n", + "Precious\n", + "farsang’s\n", + "worth\n", + "orderly\n", + "virtuous\n", + "Sheikh-ul-islám\n", + "amorous\n", + "exempt\n", + "www.gutenberg.org\n", + "perishable\n", + "navigable\n", + "limpid\n", + "fat\n", + "father’s\n" + ] + } + ], + "source": [ + "# Adjectives\n", + "adjs= set([pair[0] for pair in tagged if pair[-1]==\"JJ\"]) # we pass the list to set to uniqify\n", + "adjs= list(adjs) #Cast to list again so that we access only few in print\n", + "# Note: 'set' object has no attribute '__getitem__' and so we cannot do adjs[:15] on a set\n", + "for a in adjs[:50]:\n", + " print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15238\n" + ] + } + ], + "source": [ + "# How many ne?; note these are not uniqified\n", + "print(len(ne))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1263\n" + ] + } + ], + "source": [ + "# How many uniqe adjs?\n", + "print(len(adjs))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get collocations" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Project Gutenberg-tm; three hundred; hundred houses; Black Sea;\n", + "thousand men; two hundred; one hundred; great number; fifty aspers;\n", + "next day; Project Gutenberg; Uzún Hassan; three days; thousand houses;\n", + "five hours; Sultán Murad; 
Ahmed Páshá; Kizil Irmák; five hundred;\n", + "Mustafa Páshá\n" + ] + } + ], + "source": [ + "from nltk import Text\n", + "text=Text(tokens)\n", + "#print(type(text))\n", + "text.collocations()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Accessing webpages/html" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + " \n", + " \n", + " Sh\n" + ] + } + ], + "source": [ + "from bs4 import BeautifulSoup\n", + "url=\"http://www.bbc.com/news/technology-38892383\"\n", + "response = urlopen(url)\n", + "html = response.read().decode('utf8')\n", + "print(html[:200])" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shopping\n", + "robots\n", + "on\n", + "the\n", + "march\n", + "in\n", + "Ocado\n", + "-\n", + "BBC\n", + "News\n" + ] + } + ], + "source": [ + "raw = BeautifulSoup(html, \"lxml\").get_text()\n", + "tokens = word_tokenize(raw)\n", + "tok=tokens[:10]\n", + "for t in tok:\n", + " print(t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Working with unicode" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2167789138\tمعَ فجر العام الجديد : رجوتُ إلهيَ أن يجعلني ويجعلكمِ من أسعدِ خلقهِ ، و يرزقني ويرزقكم أضعاافَ أمنيآتِكم حتَى ترضون ...صباحكم رضى||$||\"@jumana_sj2: ولا اقول عاادي كمان يدخلو اغاني كوريه خنضحك #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@s_h_osho: نبي قناه كوريه ليش فيه قناه هنديه ومافيه كوريه؟ #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@LINA_ALADEEB: بعيداً عن خيالات الحب احياناً 
السعاده تكون عباره ع\n" + ] + } + ], + "source": [ + "import codecs\n", + "ara_text=codecs.open(\"sample_concat.tsv\", \"r\", \"utf-8\").readlines()[0]\n", + "print(ara_text[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2167789138\tمع فجر العام الجديد : رجوت إلهي أن يجعلني ويجعلكم من أسعد خلقه ، و يرزقني ويرزقكم أضعااف أمنيآتكم حتى ترضون ...صباحكم رضى||$||\"@jumana_sj2: ولا اقول عاادي كمان يدخلو اغاني كوريه خنضحك #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@s_h_osho: نبي قناه كوريه ليش فيه قناه هنديه ومافيه كوريه؟ #عشاق_كوريا_يطالبون_قناة_mbc_بفتح_قناه_mbc_korea_مترجمه_بالعربيه_للمعجبين_العرب\"||$||\"@LINA_ALADEEB: بعيدا عن خيالات الحب احيانا السعاده تكون عباره عن - برنامج \n" + ] + } + ], + "source": [ + "def remove_unicode_diac(text):\n", + " \"\"\"Takes Arabic in utf-8 and returns same text without diac\"\"\"\n", + " # Replace diacritics with nothing \n", + " text = text.replace(u\"\\u064B\", \"\")# fatHatayn\n", + " text = text.replace(u\"\\u064C\", \"\") # Dammatayn\n", + " text = text.replace(u\"\\u064D\", \"\")# kasratayn\n", + " text = text.replace(u\"\\u064E\", \"\")# fatHa\n", + " text = text.replace(u\"\\u064F\", \"\") # Damma\n", + " text = text.replace(u\"\\u0650\", \"\")# kasra\n", + " text = text.replace(u\"\\u0651\", \"\")# shaddah\n", + " text = text.replace(u\"\\u0652\", \"\")# sukuun\n", + " text = text.replace(u\"\\u0670\", \"`\") # dagger 'alif\n", + " return text\n", + "\n", + "ara_text_no_diac =remove_unicode_diac(ara_text)\n", + "print(ara_text_no_diac[:500])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<type 'unicode'>\n" + ] + } + ], + "source": [ + "print(type(ara_text_no_diac))" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Regular expressions preview!" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hey there, take a look: <URL> #love_robots!\n" + ] + } + ], + "source": [ + "import re\n", + "# This will replace the URL \"http://www.bbc.com/news/technology-38892383\" with a string token \"<URL>\"\n", + "tweet=\"Hey there, take a look: http://www.bbc.com/news #love_robots!\"\n", + "tweet = re.sub(r'https?://[^\\s<>\"]+|www\\.[^\\s<>\"]+', '<URL>',tweet)\n", + "print(tweet)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['take']\n" + ] + } + ], + "source": [ + "e_ending=[w for w in tweet.split() if re.search('e$', w)]\n", + "print(e_ending) # Note that \"there,\" ends in \",\"" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['there', 'take']\n" + ] + } + ], + "source": [ + "import string\n", + "punc = [char for char in string.punctuation]\n", + "def clean_punc(punc, text):\n", + " for i in punc:\n", + " text=text.replace(i, \"\")\n", + " return text\n", + "\n", + "tweet=clean_punc(punc, tweet)\n", + "e_ending=[w for w in tweet.split() if re.search('e$', w)]\n", + "print(e_ending) # Note that \"there,\" ends in \",\"" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['!', '\"', '#', '$', '%', '&', \"'\", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\\\', ']', '^', '_', '`', '{', '|', '}', '~']\n" + ] + } + ], + "source": [ + "print(punc)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 95, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 'hey people')\n", + "(1, 'how are you?')\n", + "(2, 'life is good!')\n" + ] + } + ], + "source": [ + "alldata=[\"hey people\", \"how are you?\", \"life is good!\"]\n", + "for line_no, line in enumerate(alldata):\n", + " print(line_no, line)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '\"', 'teachers', '\"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', \"high's\", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '\"', 'teachers', '\"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', \"teachers'\", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'high', '.', 'a', 'classic', 'line', ':', 'inspector', ':', \"i'm\", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', '.', 'student', ':', 'welcome', 'to', 'bromwell', 'high', '.', 'i', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'bromwell', 'high', 'is', 'far', 'fetched', '.', 'what', 'a', 'pity', 'that', 'it', \"isn't\", '!']\n" + ] + } + ], + 
"source": [ + "line=\"\"\"_*0 bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life , such as \" teachers \" . my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is \" teachers \" . the scramble to survive financially , the insightful students who can see right through their pathetic teachers' pomp , the pettiness of the whole situation , all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school , i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line : inspector : i'm here to sack one of your teachers . student : welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn't ! \"\"\"\n", + "line.split()[0]\n", + "words=line.split()[1:]\n", + "print(words)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "space=defaultdict(int)\n", + "for w in words:\n", + " space[w]=len(space)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "defaultdict(int,\n", + " {'!': 96,\n", + " '\"': 41,\n", + " ',': 74,\n", + " '.': 93,\n", + " '35': 25,\n", + " ':': 85,\n", + " 'a': 94,\n", + " 'about': 17,\n", + " 'adults': 88,\n", + " 'age': 89,\n", + " 'all': 59,\n", + " 'and': 64,\n", + " 'as': 22,\n", + " 'at': 76,\n", + " 'believe': 33,\n", + " 'bromwell': 91,\n", + " 'burn': 72,\n", + " 'can': 47,\n", + " 'cartoon': 4,\n", + " 'classic': 76,\n", + " 'closer': 38,\n", + " 'comedy': 5,\n", + " 'down': 73,\n", + " 'episode': 67,\n", + " 'expect': 86,\n", + " 'far': 91,\n", + " 'fetched': 92,\n", + " 'financially': 43,\n", + " 'here': 81,\n", + " 'high': 
91,\n", + " \"high's\": 35,\n", + " 'i': 86,\n", + " \"i'm\": 80,\n", + " 'immediately': 74,\n", + " 'in': 68,\n", + " 'insightful': 44,\n", + " 'inspector': 79,\n", + " 'is': 91,\n", + " \"isn't\": 95,\n", + " 'it': 95,\n", + " 'knew': 63,\n", + " 'lead': 30,\n", + " 'life': 19,\n", + " 'line': 77,\n", + " 'many': 87,\n", + " 'me': 61,\n", + " 'much': 37,\n", + " 'my': 89,\n", + " 'of': 89,\n", + " 'one': 83,\n", + " 'other': 15,\n", + " 'pathetic': 52,\n", + " 'pettiness': 55,\n", + " 'pity': 94,\n", + " 'pomp': 54,\n", + " 'profession': 29,\n", + " 'programs': 16,\n", + " 'ran': 8,\n", + " 'reality': 39,\n", + " 'recalled': 75,\n", + " 'remind': 60,\n", + " 'repeatedly': 70,\n", + " 'right': 49,\n", + " 'sack': 82,\n", + " 'same': 11,\n", + " 'satire': 36,\n", + " 'saw': 66,\n", + " 'school': 74,\n", + " 'schools': 61,\n", + " 'scramble': 41,\n", + " 'see': 48,\n", + " 'situation': 58,\n", + " 'some': 14,\n", + " 'student': 85,\n", + " 'students': 65,\n", + " 'such': 21,\n", + " 'survive': 42,\n", + " 'teachers': 85,\n", + " \"teachers'\": 53,\n", + " 'teaching': 28,\n", + " 'than': 40,\n", + " 'that': 95,\n", + " 'the': 74,\n", + " 'their': 65,\n", + " 'think': 90,\n", + " 'through': 50,\n", + " 'time': 12,\n", + " 'to': 86,\n", + " 'tried': 71,\n", + " 'welcome': 85,\n", + " 'what': 93,\n", + " 'when': 65,\n", + " 'which': 68,\n", + " 'who': 46,\n", + " 'whole': 57,\n", + " 'years': 26,\n", + " 'your': 84})" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "space" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", + " 0. 0. 0. 0. 0. 0. 0.]\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "vec = np.zeros(len(space))\n", + "print(vec)" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1.\n", + " 0. 1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1.\n", + " 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1.\n", + " 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1.\n", + " 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n", + " 1. 1. 1. 1. 1. 1. 1.]\n" + ] + } + ], + "source": [ + "for w in words:\n", + " vec[space[w]]=1\n", + "print(vec)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "abc\n", + "cd\n" + ] + } + ], + "source": [ + "x=[\"a\", \"ab\", \"abc\", \"cd\", \"xxx\"]\n", + "for i in x:\n", + " if \"c\" in i:\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['abc', 'cd']\n" + ] + } + ], + "source": [ + "c_list=[i for i in x if \"c\" in i]\n", + "print(c_list)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python_tutorial_part_4_numpy.ipynb b/python_tutorial_part_4_numpy.ipynb index ad848ce..84940f2 100644 --- 
a/python_tutorial_part_4_numpy.ipynb +++ b/python_tutorial_part_4_numpy.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 64, "metadata": { "collapsed": false }, @@ -20,7 +20,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "<type 'numpy.ndarray'>\n", + " <type 'numpy.ndarray'>\n", "a --> [2 3 4 5]\n", "b --> [5 6 7 8]\n", "a+b --> [ 7 9 11 13]\n" @@ -28,20 +28,304 @@ } ], "source": [ - "from numpy import *\n", - "#from numpy import array\n", + "# Import numpy, conventionally as \"np\"\n", "import numpy as np\n", - "a= array([2,3,4,5])\n", - "b=array((5,6,7,8))\n", + "# Numpy enables creation of N-dimensional arrays of data, or ndarrays\n", + "a=np.array([2,3,4,5])\n", + "b=np.array((5,6,7,8))\n", "print type(a)\n", "print \"a -->\", a\n", "print \"b -->\", b\n", - "print \"a+b -->\", a+b\n" + "print \"a+b -->\", a+b" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4,)\n" + ] + } + ], + "source": [ + "# We can get the shape of the array, which is a tuple of the sizes of its dimensions\n", + "print(a.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]\n" + ] + } + ], + "source": [ + "z=np.zeros(10)\n", + "print(z)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(10,)\n" + ] + } + ], + "source": [ + "print(z.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + } + ], + "source": [ + "print(z.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n" + ] + } + ], + "source": [ + "# If we had an 2*5 ndarray, and we can intialize with \"zeros\" or \"ones\":\n", + "x=np.zeros([2, 5])\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2, 5)\n", + "2\n" + ] + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 
0.]]\n" + ] + } + ], + "source": [ + "# Or we can initialize with a shape of 4, 9:\n", + "x=np.zeros([4, 9])\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4, 9)\n", + "2\n" + ] + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 1.]]\n", + "\n", + " [[ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 1.]\n", + " [ 1. 1. 1.]]]\n" + ] + } + ], + "source": [ + "# We can also create an array of > 2 dimensions\n", + "# Consider the following from the documentation of scipy: https://docs.scipy.org/doc/numpy-dev/user/quickstart.html:\n", + "\"\"\"\n", + "When you print an array, NumPy displays it in a similar way to nested lists, but with the following layout:\n", + "\n", + " the last axis is printed from left to right,\n", + " the second-to-last is printed from top to bottom,\n", + " the rest are also printed from top to bottom, with each slice separated from the next by an empty line.\n", + "\n", + "One-dimensional arrays are then printed as rows, bidimensionals as matrices and tridimensionals as lists of matrices.\n", + "\"\"\"\n", + "x=np.ones([2, 4, 3])\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2, 4, 3)\n", + "3\n" + ] + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 
0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]]\n", + "\n", + "\n", + " [[[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0.]]]]\n" + ] + } + ], + "source": [ + "x=np.zeros([2, 3, 4, 5])\n", + "print(x)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 40, "metadata": { "collapsed": false }, @@ -50,32 +334,168 @@ "name": "stdout", "output_type": "stream", "text": [ - "This will give an error!!!\n", - "a+c -->" + "(2, 3, 4, 5)\n", + "4\n" ] - }, + } + ], + "source": [ + "print(x.shape)\n", + "print(x.ndim)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ { - "ename": "ValueError", - "evalue": "operands could not be broadcast together with shapes (4,) (6,) ", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m<ipython-input-4-5f9c99476f2e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m 
\u001b[0mc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m9\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"This will give an error!!!\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0;34m\"a+c -->\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mValueError\u001b[0m: operands could not be broadcast together with shapes (4,) (6,) " + "name": "stdout", + "output_type": "stream", + "text": [ + "float64\n" + ] + } + ], + "source": [ + "print(x.dtype)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]]\n", + "\n", + " [[1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]]]\n", + "int32\n" + ] + } + ], + "source": [ + "# Note array data type...\n", + "x=np.ones([2, 4, 3], dtype=np.int32)\n", + "print(x)\n", + "print(x.dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Operations on arrays:" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]]\n", + "\n", + " [[5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]\n", + " [5 5 5]]]\n" ] } ], + "source": [ + "print(x*5)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + 
"metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]]\n", + "\n", + " [[ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]\n", + " [ 0.2 0.2 0.2]]]\n" + ] + } + ], + "source": [ + "print(x/5.0)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]\n", + " [1 1 1]]]\n" + ] + } + ], + "source": [ + "# We can slice\n", + "my_slice=x[1:2]\n", + "print(my_slice)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], "source": [ "# You can only add arrays of the same shape / equal length:\n", - "c=array([5,8,8,9,5,2])\n", - "print \"This will give an error!!!\"\n", - "print \"a+c -->\", a+c" + "c=np.array([5,8,8,9,5,2])\n", + "# print \"This will give an error if you print it!!!\"\n", + "# print \"a+c -->\", a+c" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 51, "metadata": { "collapsed": false }, @@ -84,20 +504,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "a+1 --> [3 4 5 6]\n" + "a --> [2 3 4 5]\n", + "a+1 --> [4 5 6 7]\n" ] } ], "source": [ "# broadcasting\n", "# If you add an array to a scalar, the scalar gets broadcast across all the array elements\n", - "print \"a+1 -->\", a+1\n", + "print \"a -->\", a\n", + "print \"a+1 -->\", a+2\n", "# Now you can broadcast arrays and so you can add arrays of different shapes..." 
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 52, "metadata": { "collapsed": false }, @@ -116,16 +538,22 @@ } ], "source": [ - "import numpy as np\n", "x= np.array([[1, 2, 3, 4], [5, 6, 7, 8]], dtype=np.float32)\n", "print \"Printing array x: \", x,\"\\n\"\n", "print \"\\\"Shape of array x is:\\\" \", x.shape,\"\\n\"\n", "print \"\\\"Value at x[0][1] is:\\\" \", x[0][1] # gives row0, c1 --> we start index from zero!" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More operations" + ] + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 53, "metadata": { "collapsed": false }, @@ -149,7 +577,43 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 1 6 15 6]\n" + ] + } + ], + "source": [ + "print(x*y)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# You cannot do the below:\n", + "# You will get an error:\n", + "# ValueError: operands could not be broadcast together with shapes (4,) (5,)\n", + "x=np.array([1, 3, 5, 6])\n", + "y=np.array([1,2,3,1, 9])\n", + "d=y[1:]-y[:-1]\n", + "print(x*y)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, "metadata": { "collapsed": false }, @@ -166,12 +630,12 @@ "source": [ "print sum(a)\n", "# cumsum adds every emelement to the previous element\n", - "print cumsum(a)" + "print np.cumsum(a)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 61, "metadata": { "collapsed": false }, @@ -186,14 +650,12 @@ "-------------------\n", "[2 3 4 5 6]\n", "-------------------\n", - "[2 4 6]\n", - "-------------------\n", - "[ 100. 215.443469 464.15888336 1000. 
]\n" + "[ 2 7 12 17 22 27 32 37 42 47]\n", + "-------------------\n" ] } ], "source": [ - "import numpy as np\n", "#numpy.arange: http://docs.scipy.org/doc/numpy/reference/generated/numpy.arange.html\n", "\"\"\"\n", "numpy.arange([start, ]stop, [step, ]dtype=None)\n", @@ -208,7 +670,7 @@ "print \"-------------------\"\n", "print np.arange(2,7)\n", "print \"-------------------\"\n", - "print np.arange(2,7, 2)\n", + "print np.arange(2,50, 5)\n", "print \"-------------------\"" ] }, @@ -255,7 +717,6 @@ } ], "source": [ - "import numpy as np\n", "#------------------\n", "print \"numpy.zeros\"\n", "#------------------\n", @@ -336,12 +797,11 @@ } ], "source": [ - "import numpy as np\n", "#------------------\n", "print \"\\n numpy.linspace\"\n", "#------------------\n", "\"\"\"\n", - " numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None)[source]¶\n", + " numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None)[source]\n", " Return evenly spaced numbers over a specified interval.\n", " Returns num evenly spaced samples, calculated over the interval [start, stop].\n", " The endpoint of the interval can optionally be excluded.\n", @@ -392,7 +852,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.12" } }, "nbformat": 4, diff --git a/python_tutorial_part_6_vector_space.ipynb b/python_tutorial_part_6_vector_space.ipynb index 21c002e..7bb6c5a 100644 --- a/python_tutorial_part_6_vector_space.ipynb +++ b/python_tutorial_part_6_vector_space.ipynb @@ -18,12 +18,20 @@ "outputs": [], "source": [ "# This is code to build a vector space model, with SVMs on Andrew Mass' \n", - "# distribution of movie review sentiment data." 
+ "# distribution of movie review sentiment data.\n", + "# Since we use Python's namedtuple on the code, let's take a look at what a namedtuple is first" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## namedtuple" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -32,37 +40,297 @@ "name": "stdout", "output_type": "stream", "text": [ - "25000\n", - "200\n", - "200\n" + "female\n", + "Visual Arts\n" ] } ], "source": [ + "# While Python tuples is indexed numerically (like a list), a named tuple assigns names to fields and \n", + "# is also indexed numerically. This makes it possible to access the fields in a named tuple using these names\n", + "# as if they were attributes of an object (via dotting into the namedtuple)\n", + "# See also here: https://docs.python.org/2/library/collections.html\n", "from collections import namedtuple\n", + "Student = namedtuple(\"Student\", [\"name\", \"age\", \"gender\", \"course\"])\n", + "#--------------------------------------------------------------------\n", + "# Note: You can also provide field names as a space-delimited string, rather than a list.\n", + "#Student = namedtuple(\"Student\", \"name age gender course\")\n", + "#--------------------------------------------------------------------\n", "\n", - "all_data = [] \n", - "DataDoc= namedtuple('DataDoc', 'tag words')\n", - "with open('/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata-id.txt') as alldata:\n", - " for line_no, line in enumerate(alldata):\n", - " label=line.split()[0]\n", - " word_list=line.lower().split()[1:]\n", - " all_data.append(DataDoc(label, word_list))\n", - " #print my_data[line_no]\n", - " #break\n", - "train_data = all_data[:25000]\n", - "test_data = all_data[25000:50000]\n", - "print len(train_data)\n", + "angela=Student(name=\"Angela\", age=45, gender=\"female\", course=\"Python\")\n", + "soha=Student(name=\"Soha\", age=25, 
gender=\"female\", course=\"Visual Arts\")\n", + "print(angela.gender)\n", + "print(soha.course)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Soha\n", + "25\n", + "female\n", + "Visual Arts\n" + ] + } + ], + "source": [ + "# A namedtuple is also iterable like a tuple\n", + "for i in soha:\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Visual Arts'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can access a namedtuple the same way you access a tuple or a list:\n", + "soha[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Student(name='Angela', age=45, gender='female', course='Python')\n", + "Student(name='Soha', age=25, gender='female', course='Visual Arts')\n" + ] + } + ], + "source": [ + "# We can now create a list where we append the two namedtuples above.\n", + "# i.e., a list of namedtuples\n", + "all_students=[]\n", + "all_students.append(angela)\n", + "all_students.append(soha)\n", + "for s in all_students:\n", + " print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "- Angela is 45 years old.\n", + "- Soha is 25 years old.\n" + ] + } + ], + "source": [ + "for s in all_students:\n", + " print(\"- {} is {} years old.\").format(s.name, s.age)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# We should usually get tags automatically based on input data file.\n", + "# 
In the input data file we have, we know that the first 12500 data points are positive/1.0 and the next 12500 are\n", + "# negative/0.0 then the next 12500 is poitive and the fourth chunk is negative.\n", + "# So basically the train_data has 25K (with the first half positive and the second half negative)\n", + "# and test_data with the same setup for class label. \n", + "# The rest of the data in the file is unknown/neutral/-1 and we don't use that part.\n", + "#------------------------------------------\n", + "# Format of the data is as below, with each line starting with an index.\n", + "# For example, \"_*0\" is the index in the first line. We will ignore the \"_*\" part and cast the index into\n", + "# an int\n", + "#------------------------------------------\n", + "\"\"\"\n", + "_*0 bromwell high is a cartoon comedy ....\n", + "_*1 homelessness ( or houselessness as george carlin stated )...\n", + "_*2 brilliant over-acting by lesley ann warren .\n", + "\"\"\"\n", + "#------------------------------------------\n", + "# Let's build a function that takes the index in the file and returns a numerical index that can be seen \n", + "# by the classifier we will use later\n", + "#------------------------------------------\n", "\n", - "train_data=train_data[:100]+train_data[12500:12600]\n", - "test_data=test_data[:100]+test_data[12500:12600]\n", - "print len(train_data)\n", - "print len(test_data)" + "def map_tags(post_index):\n", + " # if post is positive, tag=1, if it is negative tag=0, if it is neutral, tag=-1\n", + " tag=-1\n", + " if post_index < 12500:\n", + " tag=1\n", + " elif post_index < 25000:\n", + " tag=0\n", + " elif post_index < 37500:\n", + " tag=1\n", + " else:\n", + " pass\n", + " return tag" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 'bromwell high is a cartoon comedy ....')\n", + "(1, 'homelessness ( or 
houselessness as george carlin stated )')\n", + "(2, 'brilliant over-acting by lesley ann warren')\n" + ] + } + ], + "source": [ + "l=[\"bromwell high is a cartoon comedy ....\", \\\n", + " \"homelessness ( or houselessness as george carlin stated )\",\\\n", + " \"brilliant over-acting by lesley ann warren\"]\n", + "\n", + "for no, post in enumerate(l):\n", + " print(no, post)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<type 'int'>\n" + ] + } + ], + "source": [ + "line=\"_*0 bromwell high is a cartoon comedy ....\"\n", + "label= int(line.split()[0].split(\"*\")[-1])\n", + "print(type(label))" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50000\n", + "**************************************************\n", + "DataDoc(tag=1, words=['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '\"', 'teachers', '\"', '.', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', \"high's\", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '\"', 'teachers', '\"', '.', 'the', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', \"teachers'\", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', 
'.', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'high', '.', 'a', 'classic', 'line', ':', 'inspector', ':', \"i'm\", 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', '.', 'student', ':', 'welcome', 'to', 'bromwell', 'high', '.', 'i', 'expect', 'that', 'many', 'adults', 'of', 'my', 'age', 'think', 'that', 'bromwell', 'high', 'is', 'far', 'fetched', '.', 'what', 'a', 'pity', 'that', 'it', \"isn't\", '!'])\n", + "**************************************************\n" + ] + } + ], + "source": [ + "from collections import namedtuple\n", + "\n", + "def get_all_data():\n", + " \"\"\"\n", + " Returns a list of namedtuples from the IMDB file.\n", + " Each namedtuple has two named fields:\n", + " tag= class label (0 for \"negative\" and 1 for \"positive\")\n", + " word_list the list of words in the review\n", + " \"\"\"\n", + " # a list to house all the data\n", + " all_data = [] \n", + " \n", + " DataDoc= namedtuple('DataDoc', ['tag', 'words'])\n", + " with open('/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata-id.txt') as alldata:\n", + " for line_no, line in enumerate(alldata):\n", + " #post_index=int(line.split()[0].split(\"*\")[-1])\n", + " label=map_tags(line_no)\n", + " word_list=line.lower().split()[1:]\n", + " all_data.append(DataDoc(label, word_list))\n", + " return all_data\n", + "\n", + "# Call the function to get the data\n", + "all_data= get_all_data()\n", + "# The data are 100K reviews as explained earlier\n", + "# Since the last 50K are unknown, let's throw them away\n", + "all_data=all_data[:50000]\n", + "print(len(all_data))\n", + "print(\"*\"*50)\n", + "# print the first namedtuple\n", + "print(all_data[0])\n", + "print(\"*\"*50)\n", + "# print the last namedtuple\n", + "#print(all_data[-1])" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "500\n" + ] + } + ], + "source": [ + "# The data set is big, 
and we want to only work with a very small sample of it.\n", + "# Let's randomize the reviews and then take only 500 of them and call them train_data.\n", + "# We will then do cross-validation on these later.\n", + "from random import shuffle\n", + "shuffle(all_data)\n", + "#-------------------------\n", + "train_data = all_data[:500]\n", + "#------------------------\n", + "print len(train_data)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 81, "metadata": { "collapsed": false }, @@ -71,8 +339,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "7142\n", - "6994\n" + "13848\n", + "13828\n" ] } ], @@ -95,7 +363,7 @@ " for w in doc.words:\n", " # indexes of words won't be in sequential order as they occur in data (can you tell why?), \n", " # but that doesn't matter.\n", - " word_space[w]=len(word_space)\n", + " word_space[w]=len(word_space+1)\n", " return word_space\n", "\n", "word_space=get_space(train_data)\n", @@ -105,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 103, "metadata": { "collapsed": false }, @@ -114,15 +382,109 @@ "name": "stdout", "output_type": "stream", "text": [ - "0\n", - "200\n", - "200\n" + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + "[ 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]\n" ] } ], "source": [ "import numpy as np\n", - "\n", + "x=np.zeros(10)\n", + "print(x)\n", + "x[3]=1\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 0. 0. ..., 0. 0. 
0.]\n" + ] + } + ], + "source": [ + "big=np.zeros(len(word_space))\n", + "print(big)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "13828\n" + ] + } + ], + "source": [ + "w=\"love\"\n", + "word_index=word_space[w]\n", + "print(word_index)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "big[13828]=1" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[10, 20, 33, 44, 2, 6]\n" + ] + } + ], + "source": [ + "numbers=[10, 20, 33, 44, 50, 2, 6, 77]\n", + "less_than_fifty= [i for i in numbers if i < 50]\n", + "print(less_than_fifty)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 
0.]\n" + ] + } + ], + "source": [ "def get_sparse_vec(data_point, space):\n", " # create empty vector\n", " sparse_vec = np.zeros((len(space)))\n", @@ -130,7 +492,8 @@ " # use exception handling such that this function can also be used to vectorize \n", " # data with words not in train (i.e., test and dev data)\n", " try:\n", - " sparse_vec[space[w]]=1\n", + " word_index= space[w]\n", + " sparse_vec[word_index]=1\n", " except:\n", " continue\n", " return sparse_vec\n", @@ -138,61 +501,25 @@ " \n", "\n", "train_vecs= [get_sparse_vec(data_point, word_space) for data_point in train_data]\n", - "test_vecs= [get_sparse_vec(data_point, word_space) for data_point in test_data]\n", - "#test_vecs= get_sparse_vectors(test_data, word_space)\n", - "\n", - "#print train_vecs, test_vecs[0]\n", - "print len(train_data[12500:12600])\n", - "print len(train_vecs)\n", - "print len(test_vecs)" + "# Get class labels\n", + "train_tags=[train_data[i].tag for i in range(len(train_data))]\n", + "# Let's look at the last training data point\n", + "print(train_tags[-1])\n", + "print(train_vecs[-1][:10])" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { - "collapsed": false + "collapsed": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.0 [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", - "200\n", - "200\n" - ] - } - ], - "source": [ - "# We should usually get tags automatically based on input data file.\n", - "# In the input data file we have, we know that the first 12500 data points are positive/1.0 and the next 12500 are\n", - "# negative/0.0 then the next 12500 is poitive and the fourth chunk is negative.\n", - "# So basically the train_data has 25K (with the first half positive and the second half negative)\n", - "# and test_data with the same setup for class label. 
\n", - "# The rest of the data in the file is unknown and we don't use that part.\n", - "# We could write code to extract label automatically and we will do this based on a standardized format we will work with\n", - "# later, for now we will hard-code the labels.\n", - "\n", - "from random import shuffle, randint\n", - "\n", - "\n", - "train_tags=[ 1.0 for i in range(100)] + [ 0.0 for i in range(100)]\n", - "test_tags=[ 1.0 for i in range(100)] + [ 0.0 for i in range(100)]\n", - "\n", - "\n", - "#train_tags=[ 1.0 for i in range(12500)] + [ 0.0 for i in range(12500)]\n", - "#test_tags=[ 1.0 for i in range(12500)] + [ 0.0 for i in range(12500)]\n", - "# Side note: If the first token in each line were the tag, we could get tags as follows:\n", - "# tags= [train_data[i].tag for i in range(len(train_data))]\n", - "print train_tags[-1], train_vecs[-1][:10]\n", - "print len(train_tags)\n", - "print len(test_tags)" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 83, "metadata": { "collapsed": false }, @@ -201,11 +528,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "(200, 7142)\n" + "(500, 13848)\n" ] } ], "source": [ + "# scikit-learn likes to take data as numpy arrays. 
So, let's change our data accordingly:\n", "train_vecs=np.array(train_vecs)\n", "train_tags=np.array(train_tags)\n", "print train_vecs.shape" @@ -213,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 84, "metadata": { "collapsed": false }, @@ -234,32 +562,25 @@ "================================================== \n", "\n", "********************\n", - "\t accuracy_score\t0.715\n", + "\t accuracy_score\t0.644\n", "********************\n", - "precision_score\t0.765432098765\n", - "recall_score\t0.62\n", + "precision_score\t0.657692307692\n", + "recall_score\t0.657692307692\n", "\n", "classification_report:\n", "\n", " precision recall f1-score support\n", "\n", - " 0.0 0.68 0.81 0.74 100\n", - " 1.0 0.77 0.62 0.69 100\n", + " 0 0.63 0.63 0.63 240\n", + " 1 0.66 0.66 0.66 260\n", "\n", - "avg / total 0.72 0.71 0.71 200\n", + "avg / total 0.64 0.64 0.64 500\n", "\n", "\n", "confusion_matrix:\n", "\n", - "[[81 19]\n", - " [38 62]]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using gpu device 0: GeForce GT 750M\n" + "[[151 89]\n", + " [ 89 171]]\n" ] } ], @@ -319,8 +640,30 @@ "print \"precision_score\\t\", metrics.precision_score(train_tags, predicted)\n", "print \"recall_score\\t\", metrics.recall_score(train_tags, predicted)\n", "print \"\\nclassification_report:\\n\\n\", metrics.classification_report(train_tags, predicted)\n", - "print \"\\nconfusion_matrix:\\n\\n\", metrics.confusion_matrix(train_tags, predicted)\n", - " \n" + "print \"\\nconfusion_matrix:\\n\\n\", metrics.confusion_matrix(train_tags, predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.52\n" + ] + } + ], + "source": [ + "# Usually, we calculate a basline as the majority class in training data.\n", + "# Here, to simplify, we just get the majority class in all the data (see support, which is 
the number of data points in each\n", + "# class, in the classification report above)\n", + "majority_class=260/500.0\n", + "print(majority_class)" ] } ], @@ -340,7 +683,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.12" } }, "nbformat": 4, diff --git a/regular_expressions.ipynb b/regular_expressions.ipynb new file mode 100644 index 0000000..5eaf5fa --- /dev/null +++ b/regular_expressions.ipynb @@ -0,0 +1,719 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regular Expressions in Python" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "Python provides a powerful regular expression module (re).\n", + "A regular expression is a special sequence of characters of which you can \n", + "think as rules that helps us match certain types of content \n", + "within string literals. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The backslash \"\\\" & Raw Strings" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "From your knowledge of string literals, you already know that \n", + "a backslash \"\\\" is interpreted by the Python parser as an escape\n", + "character. For example, in the following string, in order to use an internal quotes, we have to skip them by the backslash character \"\\\"." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is a string literal that has a quote \" character.\n" + ] + } + ], + "source": [ + "text= \"This is a string literal that has a quote \\\" character.\" \n", + "print(text)" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "The parser also interprets the backslash in specific ways when followed by \n", + "specific sequences of characters. 
For example, the parser replaces the \n", + "‘\\n’ excape sequence by a newline character." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This has a quote \" char followed by \n", + "\n", + "\n", + " three new lines!!.\n" + ] + } + ], + "source": [ + "text= \"This has a quote \\\" char followed by \\n\\n\\n three new lines!!.\" \n", + "print(text)" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "The re module itself also makes use of\n", + "backslash characters to escape special regex characters, which results in \n", + "us needing to having to escape the escape character itself at times.\n", + "This relsults in unreadable code. A good solution to this problem is to use what is known as a \"raw string\", which is simply achieved by prefixing\n", + "a string literal with the ‘r’ character (right before the opening quote of the string). When we do this, the parser will treat the string literal as is without attempting to make any internal substitutions. 
See the example below:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This has a quote \\\" char followed by \\n\\n\\n three new lines!!.\n" + ] + } + ], + "source": [ + "raw_text= r\"This has a quote \\\" char followed by \\n\\n\\n three new lines!!.\" \n", + "print(raw_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## And Three for the Road: match(), search(), and findall()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "<_sre.SRE_Match at 0x105bacf38>" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# match() will only find matches if they occur at the beginning of \n", + "# the searched string:\n", + "import re\n", + "text=\"apple berry orange berry\"\n", + "re.match(r'apple',text)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'apple'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#The above means there is a match and Python is returning the matching \n", + "# Object. 
\n", + "# We can access the matched pattern with: group(0)\n", + "my_match=re.match(r'apple',text)\n", + "my_match.group(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "# Since \"berry\" is not in the beginning of the string, there will be\n", + "# no match.\n", + "print(re.match(r'berry',text))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<_sre.SRE_Match object at 0x106d33780>\n", + "<_sre.SRE_Match object at 0x106d33780>\n", + "<_sre.SRE_Match object at 0x106d33780>\n" + ] + } + ], + "source": [ + "# search() is like match(), excpet that it is not restricted to finding a match\n", + "# at the beginning: It will find a match anywhere in the string:\n", + "print(re.search(r'berry',text))\n", + "print(re.search(r'apple',text))\n", + "print(re.search(r'orange',text))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'berry'" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note that search() stops looking after it finds the first match.\n", + "# As such, even though there are wto examples of the string \"berry\",\n", + "# match() only returns one match (the first match)\n", + "my_berry_match=re.search(r'berry',text)\n", + "my_berry_match.group(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start index: 6\n", + "End index: 11\n" + ] + } + ], + "source": [ + "# We can actually access the indexes of the matched \"berry\" string:\n", + 
"start=my_berry_match.start()\n", + "end=my_berry_match.end()\n", + "print(\"Start index: %s\" % start)\n", + "print(\"End index: %s\" % end)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "berry\n" + ] + } + ], + "source": [ + "print(text[6:11])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['berry', 'berry']\n" + ] + } + ], + "source": [ + "# findall() is like search(), but is exhaustive: It finds all the matches\n", + "all_berry_matches=re.findall(r'berry',text)\n", + "print(all_berry_matches)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'berry'" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Since it returns a list of what matched, findall() does not work with\n", + "# grouping. 
Instead, just access each item in the returned list as \n", + "# what would have been a group \n", + "all_berry_matches[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### More on Grouping" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#hashtag1\n", + "#hashtag2\n" + ] + } + ], + "source": [ + "# We can surround certain surround certain parts of the regex in paranthese\n", + "# and access them later on via group numbers\n", + "tweet=\"This is a tweet with #hashtag1 and #hashtag2 https://cnn.com\"\n", + "my_hashtags=re.search(r'(#\\S+)\\s+\\S+\\s+(#\\S+)', tweet)\n", + "print(my_hashtags.group(1)) # whatever is in the first ()\n", + "print(my_hashtags.group(2)) # whatever is in the second ()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<_sre.SRE_Match object at 0x106be5d78>\n" + ] + } + ], + "source": [ + "print(my_hashtags)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " and \n" + ] + } + ], + "source": [ + "# We can surround certain surround certain parts of the regex in paranthese\n", + "# and access them later on via group numbers\n", + "tweet=\"This is a tweet with #hashtag1 and #hashtag2 https://cnn.com\"\n", + "my_hashtags=re.search(r'(#\\S+)(?P<my_and_group>\\s+\\S+\\s+)(#\\S+)', tweet)\n", + "print(my_hashtags.group(\"my_and_group\")) " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "None\n" + ] + } + ], + "source": [ + "#tweet=\"This is a tweet with #hashtag1 and #hashtag2 
https://cnn.com\"\n", + "tweet_modified=\"This is a tweet with #hashtag1 #hashtag2 https://cnn.com\"\n", + "\n", + "my_hashtags=re.search(r'#\\S+\\s+\\S+\\s+#\\S+', tweet_modified)\n", + "print(my_hashtags)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# (#\\S+) matches a hashtag \"#\", followed by one or more non-whitespaces\n", + "#----------------------------------------\n", + "# \\s+ matches one or more whitespaces\n", + "#----------------------------------------\n", + "# \\s+\\S+\\s+: Basically matches the \" and \" in the tweet, \n", + "# (note the preceding and following spaces)." + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('#hashtag1', '#hashtag2')\n" + ] + } + ], + "source": [ + "# groups() will return all matched groups as a tuple:\n", + "print(my_hashtags.groups())" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['#hashtag1', '#hashtag2']\n" + ] + } + ], + "source": [ + "# The pattern with search() above is useful if you specifically wanted\n", + "# a pattern that has \"hashtag+space(s)+and+space(s)+hashtag\"\n", + "# If you want just to get all hashtags in a tweet, just use \"findall\"\n", + "my_hashtags=re.findall(r'(#\\S+)', tweet)\n", + "print(my_hashtags)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['https://cnn.com']\n" + ] + } + ], + "source": [ + "my_url=re.findall(r'(https://\\S+.\\S+)', tweet)\n", + "print(my_url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compiling for Re-Use" + ] + }, + { + "cell_type": 
"code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['#hashtag1', '#hashtag2', 'https://cnn.com']\n" + ] + } + ], + "source": [ + "# Compile a pattern for reuse.\n", + "#------------------------------\n", + "# The \"|\" helps us match a hashtag or an URL (so if both exist,\n", + "# we capture BOTH)\n", + "p=re.compile(r'(#\\S+|https://\\S+.\\S+)')\n", + "matches=re.findall(p, tweet) # \n", + "print(matches)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sidenote: re.sub with grouping" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is a tweet with <HASHTAG> and <HASHTAG>\n" + ] + } + ], + "source": [ + "# Using the paranthes to capture a group is useful\n", + "# if you wanted to substitute\n", + "new_tweet=re.sub(r'(#\\S+)', '<HASHTAG>', tweet)\n", + "print(new_tweet)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is a tweet with <HASHTAG> and <HASHTAG>\n" + ] + } + ], + "source": [ + "new_tweet=re.sub(r'(#\\S+)', '<HASHTAG>', tweet)\n", + "print(new_tweet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grouping by name" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#hashtag1\n", + "#hashtag2\n" + ] + } + ], + "source": [ + "# Add ?P<name> before a pattern to group by name\n", + "my_hashtags=re.search(r'(?P<first>#\\S+)\\s+\\S+\\s+(?P<second>#\\S+)', tweet)\n", + "print(my_hashtags.group(\"first\")) # whatever is in the first ()\n", + "print(my_hashtags.group(\"second\")) # whatever is 
in the second ()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['interesting', 'last']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find all words with the character \"s\"\n", + "story=\"Samy told me an interesting story was airing on CBC last night...\"\n", + "re.findall(r'\\w+s\\w+', story)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['interesting', 'story', 'was', 'last']" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Since \\w* matches zero or more characters, we can get all words\n", + "# with \"s\" as follows:\n", + "re.findall(r'\\w*s\\w*', story)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Samy', 'interesting', 'story', 'was', 'last']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Well, almost! 
Let's ignore case with \"re.I\" to catch \"Samy\" as well.\n", + "re.findall(r'\\w*s\\w*', story, re.I)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/text_classification_Naive_Bayes.ipynb b/text_classification_Naive_Bayes.ipynb new file mode 100644 index 0000000..461e416 --- /dev/null +++ b/text_classification_Naive_Bayes.ipynb @@ -0,0 +1,250 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# A Vector Space Model, with scikit-learn Naive Bayes" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mam/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.\n", + " warnings.warn('Matplotlib is building the font cache using fc-list. 
This may take a moment.')\n" + ] + } + ], + "source": [ + "%matplotlib inline\n", + "import csv\n", + "import pandas\n", + "import sklearn\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.cross_validation import StratifiedKFold, cross_val_score " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50001\n" + ] + } + ], + "source": [ + "# Read the data\n", + "reviews = [line.rstrip() for line in open(\"/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata_2column.txt\")]\n", + "print(len(reviews))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\"label\",\"message\"\n", + "\"1\",\"bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as \"\" teachers \"\" . my 35 years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is \"\" teachers \"\" . the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line : inspector : i'm here to sack one of your teachers . student : welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . 
what a pity that it isn't !\"\n" + ] + } + ], + "source": [ + "# The data have a header and we print it\n", + "print(reviews[0])\n", + "# print first data point.\n", + "# data format is each review as a line, csv\n", + "# column one is the sentiment tag --> 1=positive sentiment, 0=negative sentiment\n", + "# column 2 is the review\n", + "print(reviews[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>label</th>\n", + " <th>message</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>\"label\"</td>\n", + " <td>\"message\"</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>\"1\"</td>\n", + " <td>\"bromwell high is a cartoon comedy . it ran at...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>\"1\"</td>\n", + " <td>\"homelessness ( or houselessness as george car...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>\"1\"</td>\n", + " <td>\"brilliant over-acting by lesley ann warren . ...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>\"1\"</td>\n", + " <td>\"this is easily the most underrated film inn t...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " label message\n", + "0 \"label\" \"message\"\n", + "1 \"1\" \"bromwell high is a cartoon comedy . it ran at...\n", + "2 \"1\" \"homelessness ( or houselessness as george car...\n", + "3 \"1\" \"brilliant over-acting by lesley ann warren . ...\n", + "4 \"1\" \"this is easily the most underrated film inn t..."
+ ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Let's actually read the file again with pandas\n", + "import csv\n", + "import pandas as pd\n", + "reviews = pd.read_csv(\"/Users/mam/CORE/RESEARCH/DEEPLEARNING/Doc2Vec/data/aclImdb/alldata_2column.txt\",\\\n", + " sep=',', quoting=csv.QUOTE_NONE, names=[\"label\", \"message\"])\n", + "\n", + "# Let's print a preview with the \"head\" command\n", + "reviews.head(n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/mam/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=10.\n", + " % (min_labels, self.n_folds)), Warning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.82083583 0.8122 0.8234 0.8028 0.8204 0.834 0.8218\n", + " 0.8372 0.8034 0.8082 ]\n" + ] + } + ], + "source": [ + "reviews_data=reviews[\"message\"]\n", + "reviews_tags=reviews[\"label\"]\n", + "\n", + "pipeline = Pipeline([\n", + " ('bow', CountVectorizer(analyzer='word')), # get counts of tokens\n", + " ('tfidf', TfidfTransformer()), # get tf-idf scores\n", + " ('classifier', MultinomialNB()), # train on tf-idf vectors with the Naive Bayes classifier\n", + "])\n", + "\n", + "# Do 10-fold cross validation\n", + "scores = cross_val_score(pipeline, \n", + " reviews_data, \n", + " reviews_tags, \n", + " cv=10, \n", + " scoring='accuracy',\n", + " n_jobs=-1, # use all machine cores\n", + " )\n", + "print(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.818423583283\n" + ] + } + ], + "source": [ + "# Let's get 
average accuracy...\n", + "avg= sum(scores/10.0)\n", + "print(avg)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}