diff --git a/advanced/others/fuzzy-name-matching.ipynb b/advanced/others/fuzzy-name-matching.ipynb new file mode 100644 index 0000000..dd93a3e --- /dev/null +++ b/advanced/others/fuzzy-name-matching.ipynb @@ -0,0 +1,283 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The Fuzz\n", + "- `theFuzz` uses the Levenshtein edit distance to calculate the degree of closeness between two strings. \n", + " - It also provides features for determining string similarity in various situations\n", + "- [Reference](https://www.datacamp.com/tutorial/fuzzy-string-python)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !conda install thefuzz" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from thefuzz import fuzz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## String Matching Methods\n", + "|Technique|\tDescription|\tCode Example|\n", + "|:------:|:------|:------|\n", + "|Simple Ratio|\tCalculates similarity considering the order of input strings.\t|`fuzz.ratio(name, full_name)`|\n", + "|Partial Ratio|\tFinds partial similarity by comparing the shortest string with sub-strings.|\t`fuzz.partial_ratio(name, full_name)`\n", + "|Token Sort Ratio|\tIgnores order of words in strings.|\t`fuzz.token_sort_ratio(full_name_reordered, full_name)`|\n", + "|Token Set Ratio|\tRemoves common tokens before calculating similarity.|\t`fuzz.token_set_ratio(name, full_name)`|" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simple Ratio \n", + "- `ratio()` calculates the edit distance based on the ordering of both input strings\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity score: 86\n" + ] + } + ], + "source": [ + "# Check the similarity score\n", + 
"name = \"Kurtis Pykes\"\n", + "full_name = \"Kurtis K D Pykes\"\n", + "\n", + "print(f\"Similarity score: {fuzz.ratio(name, full_name)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Partial Ratio\n", + "- `partial_ratio()` seeks to find how partially similar two strings are.\n", + " - It calculates the similarity by taking the **shortest** string, which in this scenario is stored in the variable `name`, then compares it against the **sub-strings** of the same length in the longer string, which is stored in `full_name`. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Similarity score: 67\n" + ] + } + ], + "source": [ + "print(f\"Similarity score: {fuzz.partial_ratio(name, full_name)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Since order matters in partial ratio, our score dropped to 67 in this instance. \n", + "- Therefore, to get a 100% similarity match, you would have to move the \"K D\" part to the end of the string" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Partial ratio similarity score: 100\n", + "Simple ratio similarity score: 86\n" + ] + } + ], + "source": [ + "# Order matters with partial ratio\n", + "# Check the similarity score\n", + "name = \"Kurtis Pykes\"\n", + "full_name = \"Kurtis Pykes K D\" # move K D to the end \n", + "\n", + "print(f\"Partial ratio similarity score: {fuzz.partial_ratio(name, full_name)}\")\n", + "\n", + "# But order will not affect simple ratio if strings do not match\n", + "print(f\"Simple ratio similarity score: {fuzz.ratio(name, full_name)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Token Sort Ratio\n", + "- Token sort doesn’t care about what order words occur in. 
It accounts for strings that contain the same words in a different order, as in the example above" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token sort ratio similarity score: 100\n", + "Partial ratio similarity score: 75\n", + "Simple ratio similarity score: 86\n" + ] + } + ], + "source": [ + "# Check the similarity score\n", + "full_name = \"Kurtis K D Pykes\"\n", + "full_name_reordered = \"Kurtis Pykes K D\"\n", + "\n", + "# Order does not matter for token sort ratio\n", + "print(f\"Token sort ratio similarity score: {fuzz.token_sort_ratio(full_name_reordered, full_name)}\")\n", + "\n", + "# Order matters for partial ratio\n", + "print(f\"Partial ratio similarity score: {fuzz.partial_ratio(full_name, full_name_reordered)}\")\n", + "\n", + "# Order will not affect simple ratio if strings do not match\n", + "print(f\"Simple ratio similarity score: {fuzz.ratio(name, full_name)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- If the strings contain dissimilar words, the similarity ratio will be negatively impacted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token sort ratio similarity score: 86\n" + ] + } + ], + "source": [ + "# Check the similarity score\n", + "name = \"Kurtis Pykes\"\n", + "full_name = \"Kurtis K D Pykes\" # \"Kurtis Pykes K D\"\n", + "\n", + "print(f\"Token sort ratio similarity score: {fuzz.token_sort_ratio(name, full_name)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Token Set Ratio\n", + "- The `token_set_ratio()` method is pretty similar to `token_sort_ratio()`, except it takes out common tokens before calculating how similar the strings are: this is extremely helpful when the strings are significantly different in length. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Token set ratio similarity score: 100\n" + ] + } + ], + "source": [ + "# Check the similarity score\n", + "name = \"Kurtis Pykes\"\n", + "full_name = \"Kurtis K D Pykes\"\n", + "\n", + "print(f\"Token set ratio similarity score: {fuzz.token_set_ratio(name, full_name)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Process\n", + "- The `process` module enables users to extract text from a collection using fuzzy string matching. Calling the `extract()` method on the `process` module returns a list of `(string, similarity score)` tuples. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('barcelona fc', 86), ('AFC Barcelona', 82)]\n" + ] + } + ], + "source": [ + "from thefuzz import process\n", + "\n", + "collection = [\"AFC Barcelona\", \"Barcelona AFC\", \"barcelona fc\", \"afc barcalona\"]\n", + "print(process.extract(\"barcelona\", collection, scorer=fuzz.ratio, limit=2))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python_tutorial", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/basics/notebooks/pandas-pivot-melt-crosstab.ipynb b/basics/notebooks/pandas-pivot-melt-crosstab.ipynb new file mode 100644 index 0000000..8c91038 --- /dev/null +++ b/basics/notebooks/pandas-pivot-melt-crosstab.ipynb @@ -0,0 +1,941 @@ +{ + "cells": [ + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Reshaping Dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pivot\n", + "- In pandas, there are two methods `.pivot()` and `.pivot_table()` (RECOMMENDED)\n", + "- However, `.pivot()` is unable to handle duplicate values in the index column; in this case, the index column is `cusid`, which contains multiple rows with `cusid=1` and `cusid=2`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cusidpayment_methodmerchanttotal_txn
01DEBITSHOPEE1
11DEBITGRAB2
21CREDITSHOPEE3
32CREDITSHOPEE4
42CREDITLAZADA5
52DEBITGSM6
\n", + "
" + ], + "text/plain": [ + " cusid payment_method merchant total_txn\n", + "0 1 DEBIT SHOPEE 1\n", + "1 1 DEBIT GRAB 2\n", + "2 1 CREDIT SHOPEE 3\n", + "3 2 CREDIT SHOPEE 4\n", + "4 2 CREDIT LAZADA 5\n", + "5 2 DEBIT GSM 6" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'cusid': [1,1,1,2,2,2],\n", + " 'payment_method': ['DEBIT', 'DEBIT', 'CREDIT', 'CREDIT', 'CREDIT', 'DEBIT'],\n", + " 'merchant': ['SHOPEE', 'GRAB', 'SHOPEE', 'SHOPEE', 'LAZADA', 'GSM'],\n", + " 'total_txn': [1, 2, 3, 4, 5, 6],\n", + "})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
payment_methodCREDITDEBIT
merchantLAZADASHOPEEGRABGSMSHOPEE
cusid
1NaN3.02.0NaN1.0
25.04.0NaN6.0NaN
\n", + "
" + ], + "text/plain": [ + "payment_method CREDIT DEBIT \n", + "merchant LAZADA SHOPEE GRAB GSM SHOPEE\n", + "cusid \n", + "1 NaN 3.0 2.0 NaN 1.0\n", + "2 5.0 4.0 NaN 6.0 NaN" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_df = df.pivot_table(index=[\"cusid\"], columns=[\"payment_method\", \"merchant\"], values=[\"total_txn\"])\n", + "pivot_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "MultiIndex([('CREDIT', 'LAZADA'),\n", + " ('CREDIT', 'SHOPEE'),\n", + " ( 'DEBIT', 'GRAB'),\n", + " ( 'DEBIT', 'GSM'),\n", + " ( 'DEBIT', 'SHOPEE')],\n", + " names=['payment_method', 'merchant'])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# MultiIndex\n", + "pivot_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CREDIT_LAZADACREDIT_SHOPEEDEBIT_GRABDEBIT_GSMDEBIT_SHOPEE
cusid
1NaN3.02.0NaN1.0
25.04.0NaN6.0NaN
\n", + "
" + ], + "text/plain": [ + " CREDIT_LAZADA CREDIT_SHOPEE DEBIT_GRAB DEBIT_GSM DEBIT_SHOPEE\n", + "cusid \n", + "1 NaN 3.0 2.0 NaN 1.0\n", + "2 5.0 4.0 NaN 6.0 NaN" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "flatten_name_df = pivot_df.copy()\n", + "flatten_name_df.columns = list(map(\"_\".join, pivot_df.columns))\n", + "flatten_name_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Melt (Unpivot)\n", + "- Unpivot a DataFrame from **wide** to **long** format, optionally leaving identifiers set.\n", + "- For example, we want to melt the dataframe `df` below into `subjects` and `grades` for each student instead of having multiple subjects columns" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameMathEnglishAge
0BobA+C13
1JohnBB16
2FooAB16
3BarFA+15
4AlexDF15
5TomCA13
\n", + "
" + ], + "text/plain": [ + " Name Math English Age\n", + "0 Bob A+ C 13\n", + "1 John B B 16\n", + "2 Foo A B 16\n", + "3 Bar F A+ 15\n", + "4 Alex D F 15\n", + "5 Tom C A 13" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'Name': ['Bob', 'John', 'Foo', 'Bar', 'Alex', 'Tom'],\n", + " 'Math': ['A+', 'B', 'A', 'F', 'D', 'C'],\n", + " 'English': ['C', 'B', 'B', 'A+', 'F', 'A'],\n", + " 'Age': [13, 16, 16, 15, 15, 13]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAgeSubjectGrades
4Alex15MathD
10Alex15EnglishF
3Bar15MathF
9Bar15EnglishA+
0Bob13MathA+
6Bob13EnglishC
2Foo16MathA
8Foo16EnglishB
1John16MathB
7John16EnglishB
5Tom13MathC
11Tom13EnglishA
\n", + "
" + ], + "text/plain": [ + " Name Age Subject Grades\n", + "4 Alex 15 Math D\n", + "10 Alex 15 English F\n", + "3 Bar 15 Math F\n", + "9 Bar 15 English A+\n", + "0 Bob 13 Math A+\n", + "6 Bob 13 English C\n", + "2 Foo 16 Math A\n", + "8 Foo 16 English B\n", + "1 John 16 Math B\n", + "7 John 16 English B\n", + "5 Tom 13 Math C\n", + "11 Tom 13 English A" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.melt(\n", + " id_vars=[\"Name\", \"Age\"],\n", + " value_vars=[\"Math\", \"English\"],\n", + " var_name=\"Subject\",\n", + " value_name=\"Grades\",\n", + ").sort_values(by=[\"Name\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Crosstab\n", + "- Crosstab: displays the relationship between two or more categorical variables by showing the frequency of different combinations of those variables" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GenderEducationAge
0MaleGraduate27
1FemaleUndergraduate18
2FemaleUndergraduate19
3MaleGraduate24
4MaleGraduate29
5FemaleGraduate23
6MaleUndergraduate18
\n", + "
" + ], + "text/plain": [ + " Gender Education Age\n", + "0 Male Graduate 27\n", + "1 Female Undergraduate 18\n", + "2 Female Undergraduate 19\n", + "3 Male Graduate 24\n", + "4 Male Graduate 29\n", + "5 Female Graduate 23\n", + "6 Male Undergraduate 18" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'Gender': ['Male', 'Female', 'Female', 'Male', 'Male','Female', 'Male'],\n", + " 'Education': ['Graduate', 'Undergraduate', 'Undergraduate', 'Graduate', 'Graduate', 'Graduate', 'Undergraduate'],\n", + " 'Age': [27, 18, 19, 24, 29, 23,18]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EducationGraduateUndergraduate
Gender
Female02
Male30
\n", + "
" + ], + "text/plain": [ + "Education Graduate Undergraduate\n", + "Gender \n", + "Female 0 2\n", + "Male 3 0" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Basic crosstab\n", + "cross_tab = pd.crosstab(df['Gender'], df['Education'])\n", + "cross_tab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EducationGraduateUndergraduate
Gender
Female0.1428570.285714
Male0.4285710.142857
\n", + "
" + ], + "text/plain": [ + "Education Graduate Undergraduate\n", + "Gender \n", + "Female 0.142857 0.285714\n", + "Male 0.428571 0.142857" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Crosstab with normalization: shows the proportion of each combination relative to the total.\n", + "cross_tab_normalized = pd.crosstab(df['Gender'], df['Education'], normalize='all')\n", + "cross_tab_normalized" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
EducationGraduateUndergraduate
Gender
Female23.00000018.5
Male26.66666718.0
\n", + "
" + ], + "text/plain": [ + "Education Graduate Undergraduate\n", + "Gender \n", + "Female 23.000000 18.5\n", + "Male 26.666667 18.0" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Crosstab with aggregation for each combination\n", + "cross_tab_agg = pd.crosstab(df['Gender'], df['Education'], values=df['Age'], aggfunc='mean')\n", + "cross_tab_agg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "# Crosstab with margins\n", + "cross_tab_margins = pd.crosstab(df['Gender'], df['Education'], margins=True, margins_name=\"Total\")\n", + "print(\"\\nCrosstab with Margins:\")\n", + "print(cross_tab_margins)\n", + "\n", + "\n", + "print(\"\\nCrosstab with Normalization:\")\n", + "print(cross_tab_normalized)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ml_env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/basics/notebooks/subprocess.ipynb b/basics/notebooks/subprocess.ipynb index 180151e..1461054 100644 --- a/basics/notebooks/subprocess.ipynb +++ b/basics/notebooks/subprocess.ipynb @@ -244,13 +244,6 @@ "\n", "print(result.stdout.decode()) # decode() to convert from bytes to strings\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {