{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "(00:pandas_tutorial)=\n", "# Pandas" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%config InlineBackend.figure_format = 'retina'\n", "import numpy as np\n", "import pandas as pd\n", "from matplotlib import pyplot as plt\n", "from matplotlib import rcParams\n", "\n", "# We need to do it in a separate cell. See:\n", "# https://github.com/jupyter/notebook/issues/3385\n", "plt.style.use('default')\n", "rcParams.update({'font.size':12})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Basic Usage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The Iris data set is one of the classic sample data sets used by statisticians. See [this Wikipedia article](https://en.wikipedia.org/wiki/Iris_flower_data_set)\n", "![](https://upload.wikimedia.org/wikipedia/commons/thumb/7/7f/Mature_flower_diagram.svg/1920px-Mature_flower_diagram.svg.png)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "type(iris) = \n" ] } ], "source": [ "iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')\n", "print(f\"{type(iris) = }\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
1446.73.35.72.5virginica
1456.73.05.22.3virginica
1466.32.55.01.9virginica
1476.53.05.22.0virginica
1486.23.45.42.3virginica
1495.93.05.11.8virginica
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "144 6.7 3.3 5.7 2.5 virginica\n", "145 6.7 3.0 5.2 2.3 virginica\n", "146 6.3 2.5 5.0 1.9 virginica\n", "147 6.5 3.0 5.2 2.0 virginica\n", "148 6.2 3.4 5.4 2.3 virginica\n", "149 5.9 3.0 5.1 1.8 virginica" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.tail(6)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
count150.000000150.000000150.000000150.000000
mean5.8433333.0573333.7580001.199333
std0.8280660.4358661.7652980.762238
min4.3000002.0000001.0000000.100000
25%5.1000002.8000001.6000000.300000
50%5.8000003.0000004.3500001.300000
75%6.4000003.3000005.1000001.800000
max7.9000004.4000006.9000002.500000
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "count 150.000000 150.000000 150.000000 150.000000\n", "mean 5.843333 3.057333 3.758000 1.199333\n", "std 0.828066 0.435866 1.765298 0.762238\n", "min 4.300000 2.000000 1.000000 0.100000\n", "25% 5.100000 2.800000 1.600000 0.300000\n", "50% 5.800000 3.000000 4.350000 1.300000\n", "75% 6.400000 3.300000 5.100000 1.800000\n", "max 7.900000 4.400000 6.900000 2.500000" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can save by ``.to_csv()``, print to html by ``to_html()``, convert to numpy by ``.to_numpy()``, etc:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# iris.to_csv(\"iris_data.csv\")\n", "# iris.to_html()\n", "# iris.to_numpy()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Selection of Columns and Rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you select a column or a row, it now becomes ``pandas.Series``, not ``DataFrame``:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "0 5.1\n", "1 4.9\n", "2 4.7\n", "3 4.6\n", "4 5.0\n", "Name: sepal_length, dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sep_len = iris[\"sepal_length\"]\n", "\n", "print(type(sep_len))\n", "sep_len.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert to numpy:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,\n", " 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,\n", " 5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,\n", " 5.1, 5. 
, 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,\n", " 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,\n", " 6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,\n", " 6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,\n", " 6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,\n", " 6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,\n", " 7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,\n", " 7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,\n", " 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sep_len.to_numpy() \n", "# Identical to sep_len.values" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Selecting Elements (``.loc`` and ``.iloc``)\n", "Two major ways to select elements:\n", "\n", "* ``loc`` is used when you want to select by labels (row index label and column name)\n", "    * ``.loc[i, column]``, such as ``iris.loc[0, \"sepal_length\"]``\n", "* ``iloc`` is used when you want to select by integer positions\n", "    * ``.iloc[i, j]``, such as ``iris.iloc[0, 1]`` or ``iris.iloc[-1]``" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "iris.iloc[0, 1] = 3.5\n", "iris.loc[0, 'sepal_width'] = 3.5\n" ] } ], "source": [ "# To select only one:\n", "print(f\"{iris.iloc[0, 1] = }\")\n", "print(f\"{iris.loc[0, 'sepal_width'] = }\")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "sepal_length 5.1\n", "sepal_width 3.5\n", "petal_length 1.4\n", "petal_width 0.2\n", "species setosa\n", "Name: 0, dtype: object" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select a row by iloc\n", "iris.iloc[0, :]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "sepal_length 5.1\n", "sepal_width 3.5\n", "petal_length 1.4\n", "petal_width 0.2\n", "species setosa\n", "Name: 0, dtype: object" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select a row by loc:\n", "iris.loc[0]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "sepal_length 5.1\n", "sepal_width 3.5\n", "petal_length 1.4\n", "petal_width 0.2\n", "species setosa\n", "Name: 0, dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "row_0 = iris.iloc[0]\n", "\n", "print(type(row_0))\n", "row_0" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "And when converting the `Series` obejct to `numpy.ndarray`:" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4,\n", " 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, 1.7, 1.5, 1. , 1.7, 1.9, 1.6,\n", " 1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5, 1.4, 1.5, 1.2, 1.3, 1.4, 1.3,\n", " 1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4, 1.6, 1.4, 1.5, 1.4, 4.7, 4.5,\n", " 4.9, 4. , 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.2, 4. , 4.7, 3.6,\n", " 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4. , 4.9, 4.7, 4.3, 4.4, 4.8, 5. ,\n", " 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.5, 4.7, 4.4, 4.1, 4. , 4.4,\n", " 4.6, 4. , 3.3, 4.2, 4.2, 4.2, 4.3, 3. , 4.1, 6. , 5.1, 5.9, 5.6,\n", " 5.8, 6.6, 4.5, 6.3, 5.8, 6.1, 5.1, 5.3, 5.5, 5. , 5.1, 5.3, 5.5,\n", " 6.7, 6.9, 5. , 5.7, 4.9, 6.7, 4.9, 5.7, 6. , 4.8, 4.9, 5.6, 5.8,\n", " 6.1, 6.4, 5.6, 5.1, 5.6, 6.1, 5.6, 5.5, 4.8, 5.4, 5.6, 5.1, 5.1,\n", " 5.9, 5.7, 5.2, 5. 
, 5.2, 5.4, 5.1])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris[\"petal_length\"].values" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Selecting by Conditions\n", "To make a new DataFrame from the subset of rows of an existing DataFrame that satisfy a condition:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
507.03.24.71.4versicolor
516.43.24.51.5versicolor
526.93.14.91.5versicolor
535.52.34.01.3versicolor
546.52.84.61.5versicolor
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "50 7.0 3.2 4.7 1.4 versicolor\n", "51 6.4 3.2 4.5 1.5 versicolor\n", "52 6.9 3.1 4.9 1.5 versicolor\n", "53 5.5 2.3 4.0 1.3 versicolor\n", "54 6.5 2.8 4.6 1.5 versicolor" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris2 = iris.loc[iris[\"species\"] == \"versicolor\"].copy()\n", "# I always recommend you to use .copy() at the end, if you don't know what it means.\n", "iris2.head()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
836.02.75.11.6versicolor
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "83 6.0 2.7 5.1 1.6 versicolor" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris2 = iris.loc[(iris[\"petal_length\"] > 5) & (iris[\"species\"] == \"versicolor\")].copy()\n", "# I always recommend you to use .copy() at the end, if you don't know what it means.\n", "iris2.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Note that you have to use parentheses like ``.loc[(condition1) & (condition2)]``." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Iterate through the DataFrame\n", "\n", "In the example below, I tried to add a column named \"test\" while iterating through the rows:" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "sepal_length 5.1\n", "sepal_width 3.5\n", "petal_length 1.4\n", "petal_width 0.2\n", "species setosa\n", "Name: 0, dtype: object\n", "\n", "sepal_length 5.1\n", "sepal_width 3.5\n", "petal_length 1.4\n", "petal_width 0.2\n", "species setosa\n", "test 999\n", "Name: 0, dtype: object\n" ] } ], "source": [ "for i, row in iris.iterrows():\n", " if i == 0:\n", " print(i)\n", " print(row)\n", " row[\"test\"] = 999\n", " print()\n", " print(row)\n" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sepal_length 5.1\n", "sepal_width 3.5\n", "petal_length 1.4\n", "petal_width 0.2\n", "species setosa\n", "Name: 0, dtype: object\n" ] } ], "source": [ "print(iris.iloc[0])\n", "# Note that it is not changed!" 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "How do we actually modify them?\n", "\n", "A **non-working** example:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Initialize\n", "iris[\"test\"] = None\n", "for i, row in iris.iterrows():\n", " row.loc[\"test\"] = row[\"sepal_length\"] + row[\"petal_length\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here, ``row`` is just a copy of one row of the original DataFrame, so assigning to it changes nothing in ``iris``:" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspeciestest
05.13.51.40.2setosaNone
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species test\n", "0 5.1 3.5 1.4 0.2 setosa None" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.head(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A **working** example:\n", "\n", "You need to directly access to the original DataFrame:" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# Initialize\n", "iris[\"test\"] = None\n", "for i, row in iris.iterrows():\n", " iris.at[i, \"test\"] = row[\"sepal_length\"] + row[\"petal_length\"]" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspeciestest
05.13.51.40.2setosa6.5
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species test\n", "0 5.1 3.5 1.4 0.2 setosa 6.5" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.head(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Things go well in this case with ``loc``, but I used ``at``.\n", "* ``loc`` is slower, but you can access to multiple locations\n", "* ``at`` is quicker, but you can access to only one single location\n", "\n", "See [here](https://stackoverflow.com/questions/37216485/pandas-at-versus-loc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A **working and good** example for this specific case:\n", "\n", "But in this kind of simple summation case, you can just do" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspeciestest
05.13.51.40.2setosa6.5
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species test\n", "0 5.1 3.5 1.4 0.2 setosa 6.5" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris[\"test\"] = iris[\"sepal_length\"] + iris[\"petal_length\"]\n", "iris.head(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**You don't even need that initialization.**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The iteration is useful when you do some complicated jobs:" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Initialize\n", "iris[\"test\"] = None\n", "for i, row in iris.iterrows():\n", " length_sum = row[\"sepal_length\"] + row[\"petal_length\"]\n", " if length_sum > 0.1:\n", " length_sum = 0.1\n", " iris.at[i, \"test\"] = length_sum" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Drop" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "pandas usually generate annoying index (``0, 1, ..., N``) columns etc. 
You can drop them by" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "\"['test'] not found in axis\"", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[41], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m iris \u001b[39m=\u001b[39m iris\u001b[39m.\u001b[39;49mdrop(columns\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mtest\u001b[39;49m\u001b[39m\"\u001b[39;49m])\n\u001b[1;32m 3\u001b[0m iris\u001b[39m.\u001b[39mhead(\u001b[39m1\u001b[39m)\n", "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/pandas/util/_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[1;32m 326\u001b[0m warnings\u001b[39m.\u001b[39mwarn(\n\u001b[1;32m 327\u001b[0m msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[1;32m 328\u001b[0m \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[1;32m 329\u001b[0m stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/pandas/core/frame.py:5399\u001b[0m, in \u001b[0;36mDataFrame.drop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 5251\u001b[0m \u001b[39m@deprecate_nonkeyword_arguments\u001b[39m(version\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, allowed_args\u001b[39m=\u001b[39m[\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m, 
\u001b[39m\"\u001b[39m\u001b[39mlabels\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 5252\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdrop\u001b[39m( \u001b[39m# type: ignore[override]\u001b[39;00m\n\u001b[1;32m 5253\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 5260\u001b[0m errors: IgnoreRaise \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mraise\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 5261\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m DataFrame \u001b[39m|\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 5262\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 5263\u001b[0m \u001b[39m Drop specified labels from rows or columns.\u001b[39;00m\n\u001b[1;32m 5264\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 5397\u001b[0m \u001b[39m weight 1.0 0.8\u001b[39;00m\n\u001b[1;32m 5398\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 5399\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39;49m()\u001b[39m.\u001b[39;49mdrop(\n\u001b[1;32m 5400\u001b[0m labels\u001b[39m=\u001b[39;49mlabels,\n\u001b[1;32m 5401\u001b[0m axis\u001b[39m=\u001b[39;49maxis,\n\u001b[1;32m 5402\u001b[0m index\u001b[39m=\u001b[39;49mindex,\n\u001b[1;32m 5403\u001b[0m columns\u001b[39m=\u001b[39;49mcolumns,\n\u001b[1;32m 5404\u001b[0m level\u001b[39m=\u001b[39;49mlevel,\n\u001b[1;32m 5405\u001b[0m inplace\u001b[39m=\u001b[39;49minplace,\n\u001b[1;32m 5406\u001b[0m errors\u001b[39m=\u001b[39;49merrors,\n\u001b[1;32m 5407\u001b[0m )\n", "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/pandas/util/_decorators.py:331\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments..decorate..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(args) \u001b[39m>\u001b[39m num_allow_args:\n\u001b[1;32m 326\u001b[0m warnings\u001b[39m.\u001b[39mwarn(\n\u001b[1;32m 327\u001b[0m 
msg\u001b[39m.\u001b[39mformat(arguments\u001b[39m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[1;32m 328\u001b[0m \u001b[39mFutureWarning\u001b[39;00m,\n\u001b[1;32m 329\u001b[0m stacklevel\u001b[39m=\u001b[39mfind_stack_level(),\n\u001b[1;32m 330\u001b[0m )\n\u001b[0;32m--> 331\u001b[0m \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/pandas/core/generic.py:4505\u001b[0m, in \u001b[0;36mNDFrame.drop\u001b[0;34m(self, labels, axis, index, columns, level, inplace, errors)\u001b[0m\n\u001b[1;32m 4503\u001b[0m \u001b[39mfor\u001b[39;00m axis, labels \u001b[39min\u001b[39;00m axes\u001b[39m.\u001b[39mitems():\n\u001b[1;32m 4504\u001b[0m \u001b[39mif\u001b[39;00m labels \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m-> 4505\u001b[0m obj \u001b[39m=\u001b[39m obj\u001b[39m.\u001b[39;49m_drop_axis(labels, axis, level\u001b[39m=\u001b[39;49mlevel, errors\u001b[39m=\u001b[39;49merrors)\n\u001b[1;32m 4507\u001b[0m \u001b[39mif\u001b[39;00m inplace:\n\u001b[1;32m 4508\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_update_inplace(obj)\n", "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/pandas/core/generic.py:4546\u001b[0m, in \u001b[0;36mNDFrame._drop_axis\u001b[0;34m(self, labels, axis, level, errors, only_slice)\u001b[0m\n\u001b[1;32m 4544\u001b[0m new_axis \u001b[39m=\u001b[39m axis\u001b[39m.\u001b[39mdrop(labels, level\u001b[39m=\u001b[39mlevel, errors\u001b[39m=\u001b[39merrors)\n\u001b[1;32m 4545\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m-> 4546\u001b[0m new_axis \u001b[39m=\u001b[39m axis\u001b[39m.\u001b[39;49mdrop(labels, errors\u001b[39m=\u001b[39;49merrors)\n\u001b[1;32m 4547\u001b[0m indexer \u001b[39m=\u001b[39m axis\u001b[39m.\u001b[39mget_indexer(new_axis)\n\u001b[1;32m 4549\u001b[0m \u001b[39m# Case for non-unique 
axis\u001b[39;00m\n\u001b[1;32m 4550\u001b[0m \u001b[39melse\u001b[39;00m:\n", "File \u001b[0;32m~/miniforge3/lib/python3.10/site-packages/pandas/core/indexes/base.py:6934\u001b[0m, in \u001b[0;36mIndex.drop\u001b[0;34m(self, labels, errors)\u001b[0m\n\u001b[1;32m 6932\u001b[0m \u001b[39mif\u001b[39;00m mask\u001b[39m.\u001b[39many():\n\u001b[1;32m 6933\u001b[0m \u001b[39mif\u001b[39;00m errors \u001b[39m!=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mignore\u001b[39m\u001b[39m\"\u001b[39m:\n\u001b[0;32m-> 6934\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mKeyError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mlist\u001b[39m(labels[mask])\u001b[39m}\u001b[39;00m\u001b[39m not found in axis\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 6935\u001b[0m indexer \u001b[39m=\u001b[39m indexer[\u001b[39m~\u001b[39mmask]\n\u001b[1;32m 6936\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mdelete(indexer)\n", "\u001b[0;31mKeyError\u001b[0m: \"['test'] not found in axis\"" ] } ], "source": [ "iris = iris.drop(columns=[\"test\"])\n", "\n", "iris.head(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "NaN's can be removed by ``.dropna()``:" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
0NaNNaNNaNNaNNone
1NaNNaNNaNNaNNone
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 NaN NaN NaN NaN None\n", "1 NaN NaN NaN NaN None\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_nan = iris.copy()\n", "iris_nan.iloc[:2] = None\n", "iris_nan.head()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
34.63.11.50.2setosa
45.03.61.40.2setosa
55.43.91.70.4setosa
64.63.41.40.3setosa
75.03.41.50.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa\n", "5 5.4 3.9 1.7 0.4 setosa\n", "6 4.6 3.4 1.4 0.3 setosa\n", "7 5.0 3.4 1.5 0.2 setosa" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_nan = iris_nan.dropna()\n", "iris_nan.head() # try .reset_index() by yourself" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexsepal_lengthsepal_widthpetal_lengthpetal_widthspecies
034.63.11.50.2setosa
145.03.61.40.2setosa
255.43.91.70.4setosa
364.63.41.40.3setosa
475.03.41.50.2setosa
\n", "
" ], "text/plain": [ " index sepal_length sepal_width petal_length petal_width species\n", "0 3 4.6 3.1 1.5 0.2 setosa\n", "1 4 5.0 3.6 1.4 0.2 setosa\n", "2 5 5.4 3.9 1.7 0.4 setosa\n", "3 6 4.6 3.4 1.4 0.3 setosa\n", "4 7 5.0 3.4 1.5 0.2 setosa" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_nan.reset_index().head()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
04.63.11.50.2setosa
15.03.61.40.2setosa
25.43.91.70.4setosa
34.63.41.40.3setosa
45.03.41.50.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 4.6 3.1 1.5 0.2 setosa\n", "1 5.0 3.6 1.4 0.2 setosa\n", "2 5.4 3.9 1.7 0.4 setosa\n", "3 4.6 3.4 1.4 0.3 setosa\n", "4 5.0 3.4 1.5 0.2 setosa" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_nan.reset_index(drop=True).head()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# iris_nan.to_csv(\"test.csv\", index=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sorting" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
605.02.03.51.0versicolor
626.02.24.01.0versicolor
686.22.24.51.5versicolor
1196.02.25.01.5virginica
414.52.31.30.3setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "60 5.0 2.0 3.5 1.0 versicolor\n", "62 6.0 2.2 4.0 1.0 versicolor\n", "68 6.2 2.2 4.5 1.5 versicolor\n", "119 6.0 2.2 5.0 1.5 virginica\n", "41 4.5 2.3 1.3 0.3 setosa" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.sort_values(by=[\"sepal_width\", \"petal_length\"]).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As can be seen, the index numbers are kept but \"ordered\" based on the column.\n", "\n", "Reset it:" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.02.03.51.0versicolor
16.02.24.01.0versicolor
26.02.25.01.5virginica
36.22.24.51.5versicolor
44.52.31.30.3setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.0 2.0 3.5 1.0 versicolor\n", "1 6.0 2.2 4.0 1.0 versicolor\n", "2 6.0 2.2 5.0 1.5 virginica\n", "3 6.2 2.2 4.5 1.5 versicolor\n", "4 4.5 2.3 1.3 0.3 setosa" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_sort = (iris.sort_values(by=\"sepal_width\")\n", " .reset_index(drop=True)\n", ")\n", "iris_sort.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See what happens if ``drop = False``." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Grouping is one of the most useful functionality of pandas. " ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "grouped = iris.groupby(\"species\")" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(type(grouped))" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "grouped.get_group(\"setosa\").head(1)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "setosa\n", " sepal_length sepal_width petal_length petal_width\n", "count 50.00000 50.000000 50.000000 50.000000\n", "mean 5.00600 3.428000 1.462000 0.246000\n", "std 0.35249 0.379064 0.173664 0.105386\n", "min 4.30000 2.300000 1.000000 0.100000\n", "25% 4.80000 3.200000 1.400000 0.200000\n", "50% 5.00000 3.400000 1.500000 0.200000\n", "75% 5.20000 3.675000 1.575000 0.300000\n", "max 5.80000 4.400000 1.900000 0.600000\n", "\n", "versicolor\n", " sepal_length sepal_width petal_length petal_width\n", "count 50.000000 50.000000 50.000000 50.000000\n", "mean 5.936000 2.770000 4.260000 1.326000\n", "std 0.516171 0.313798 0.469911 0.197753\n", "min 4.900000 2.000000 3.000000 1.000000\n", "25% 5.600000 2.525000 4.000000 1.200000\n", "50% 5.900000 2.800000 4.350000 1.300000\n", "75% 6.300000 3.000000 4.600000 1.500000\n", "max 7.000000 3.400000 5.100000 1.800000\n", "\n", "virginica\n", " sepal_length sepal_width petal_length petal_width\n", "count 50.00000 50.000000 50.000000 50.00000\n", "mean 6.58800 2.974000 5.552000 2.02600\n", "std 0.63588 0.322497 0.551895 0.27465\n", "min 4.90000 2.200000 4.500000 1.40000\n", "25% 6.22500 2.800000 5.100000 1.80000\n", "50% 6.50000 3.000000 5.550000 2.00000\n", "75% 6.90000 3.175000 5.875000 2.30000\n", "max 7.90000 3.800000 6.900000 2.50000\n", "\n" ] } ], "source": [ "for name, group in grouped:\n", " print(name)\n", " print(group.describe())\n", " print()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Masking and Special Operations" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
84.42.91.40.2setosa
94.93.11.50.1setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "8 4.4 2.9 1.4 0.2 setosa\n", "9 4.9 3.1 1.5 0.1 setosa" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mask = iris[\"sepal_width\"] < 3.3\n", "iris.loc[mask].head() # Try .reset_index() by yourself" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are many of the things you can do for ``str``, e.g., ``.split`` or ``.replace``, etc.\n", "\n", "You can do that on all the ``str``s in a column without for loop in pandas:" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthspecies
05.13.51.40.2setosa
14.93.01.40.2setosa
24.73.21.30.2setosa
34.63.11.50.2setosa
45.03.61.40.2setosa
\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "setosa_mask = iris[\"species\"].str.startswith(\"seto\")\n", "iris.loc[setosa_mask].head()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 testingtestingsa\n", "1 testingtestingsa\n", "2 testingtestingsa\n", "3 testingtestingsa\n", "4 testingtestingsa\n", "Name: species, dtype: object" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris[\"species\"].str.replace(\"seto\", \"testingtesting\").head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A Simple Plotting" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": { "image/png": { "height": 487, "width": 784 } }, "output_type": "display_data" } ], "source": [ "fig, axs = plt.subplots(1, 1, figsize=(8, 5), sharex=False, sharey=False, gridspec_kw=None)\n", "\n", "for name, g in grouped:\n", " axs.plot(g[\"sepal_length\"], g[\"sepal_width\"], '.', alpha=0.5, label=name)\n", " \n", "axs.legend(loc=1)\n", "axs.grid()\n", "axs.set(xlabel=\"sepal length\", ylabel=\"sepal width\")\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "window_display": false } }, "nbformat": 4, "nbformat_minor": 2 }