Pandas

4.5. Pandas#

%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import rcParams

# We need to do it in a separate cell. See:
# https://github.com/jupyter/notebook/issues/3385
plt.style.use('default')
rcParams.update({'font.size':12})

4.5.1. Basic Usage#

Iris data is one of the typical statistical sample used by statisticians. See this wikipedia

iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')
print(f"{type(iris) = }")

type(iris) = <class 'pandas.core.frame.DataFrame'>

iris.head()

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

iris.tail(6)

	sepal_length	sepal_width	petal_length	petal_width	species
144	6.7	3.3	5.7	2.5	virginica
145	6.7	3.0	5.2	2.3	virginica
146	6.3	2.5	5.0	1.9	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica

iris.describe()

	sepal_length	sepal_width	petal_length	petal_width
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.057333	3.758000	1.199333
std	0.828066	0.435866	1.765298	0.762238
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

You can save by .to_csv(), print to html by to_html(), convert to numpy by .to_numpy(), etc:

# iris.to_csv("iris_data.csv")
# iris.to_html()
# iris.to_numpy()

4.5.2. Selection of Columns and Rows#

If you select a column or a row, it now becomes pandas.Series, not DataFrame:

sep_len = iris["sepal_length"]

print(type(sep_len))
sep_len.head()

<class 'pandas.core.series.Series'>

  5.1
  4.9
  4.7
  4.6
  5.0
Name: sepal_length, dtype: float64

Convert to numpy:

sep_len.to_numpy()  
# Identical to sep_len.values

array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.6, 5. , 4.4, 4.9, 5.4, 4.8, 4.8,
       4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5. ,
       5. , 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5. , 5.5, 4.9, 4.4,
       5.1, 5. , 4.5, 4.4, 5. , 5.1, 4.8, 5.1, 4.6, 5.3, 5. , 7. , 6.4,
       6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5. , 5.9, 6. , 6.1, 5.6,
       6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7,
       6. , 5.7, 5.5, 5.5, 5.8, 6. , 5.4, 6. , 6.7, 6.3, 5.6, 5.5, 5.5,
       6.1, 5.8, 5. , 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3,
       6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5,
       7.7, 7.7, 6. , 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2,
       7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6. , 6.9, 6.7, 6.9, 5.8,
       6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9])

4.5.3. Selecting Elements (`.loc` and `.iloc`)#

Two major ways to select elements:

loc is used when you want to use column name and row index
- .loc[column, i], such as iris.loc["sepal_length", 0]
iloc is used when you want to use integer indexing
- .iloc[i, j], such as iris.iloc[0, 1] or iris.iloc[-1]

# To select only one:
print(f"{iris.iloc[0, 1]            = }")
print(f"{iris.loc[0, 'sepal_width'] = }")

iris.iloc[0, 1]            = 3.5
iris.loc[0, 'sepal_width'] = 3.5

# Select a row by iloc
iris.iloc[0, :]

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

# Select a row by loc:
iris.loc[0]

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

row_0 = iris.iloc[0]

print(type(row_0))
row_0

<class 'pandas.core.series.Series'>

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

And when converting the Series obejct to numpy.ndarray:

iris["petal_length"].values

array([1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4,
       1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, 1.7, 1.5, 1. , 1.7, 1.9, 1.6,
       1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5, 1.4, 1.5, 1.2, 1.3, 1.4, 1.3,
       1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4, 1.6, 1.4, 1.5, 1.4, 4.7, 4.5,
       4.9, 4. , 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.2, 4. , 4.7, 3.6,
       4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4. , 4.9, 4.7, 4.3, 4.4, 4.8, 5. ,
       4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.5, 4.7, 4.4, 4.1, 4. , 4.4,
       4.6, 4. , 3.3, 4.2, 4.2, 4.2, 4.3, 3. , 4.1, 6. , 5.1, 5.9, 5.6,
       5.8, 6.6, 4.5, 6.3, 5.8, 6.1, 5.1, 5.3, 5.5, 5. , 5.1, 5.3, 5.5,
       6.7, 6.9, 5. , 5.7, 4.9, 6.7, 4.9, 5.7, 6. , 4.8, 4.9, 5.6, 5.8,
       6.1, 6.4, 5.6, 5.1, 5.6, 6.1, 5.6, 5.5, 4.8, 5.4, 5.6, 5.1, 5.1,
       5.9, 5.7, 5.2, 5. , 5.2, 5.4, 5.1])

4.5.4. Selecting by Conditions#

When making a new DataFrame using a subset of the columns of an existing DataFrame:

iris2 = iris.loc[iris["species"] == "versicolor"].copy()
# I always recommend you to use .copy() at the end, if you don't know what it means.
iris2.head()

	sepal_length	sepal_width	petal_length	petal_width	species
50	7.0	3.2	4.7	1.4	versicolor
51	6.4	3.2	4.5	1.5	versicolor
52	6.9	3.1	4.9	1.5	versicolor
53	5.5	2.3	4.0	1.3	versicolor
54	6.5	2.8	4.6	1.5	versicolor

iris2 = iris.loc[(iris["petal_length"] > 5) & (iris["species"] == "versicolor")].copy()
# I always recommend you to use .copy() at the end, if you don't know what it means.
iris2.head()

	sepal_length	sepal_width	petal_length	petal_width	species
83	6.0	2.7	5.1	1.6	versicolor

Note that you have to use parentheses like .loc[(condition1) & (condition2)].

4.5.5. Iterate through the DataFrame#

In the example below, I tried to add a column named “test” while iterating through the rows:

for i, row in iris.iterrows():
    if i == 0:
        print(i)
        print(row)
        row["test"] = 999
        print()
        print(row)

0
sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
test               999
Name: 0, dtype: object

print(iris.iloc[0])
# Note that it is not changed!

sepal_length       5.1
sepal_width        3.5
petal_length       1.4
petal_width        0.2
species         setosa
Name: 0, dtype: object

How do we modify them actually?

A not working example:

# Initialize
iris["test"] = None
for i, row in iris.iterrows():
    row.loc["test"] = row["sepal_length"] + row["petal_length"]

Here, row is just a copy of the original DataFrame. So nothing is changed in iris:

iris.head(1)

	sepal_length	sepal_width	petal_length	petal_width	species	test
0	5.1	3.5	1.4	0.2	setosa	None

A working example:

You need to directly access to the original DataFrame:

# Initialize
iris["test"] = None
for i, row in iris.iterrows():
    iris.at[i, "test"] = row["sepal_length"] + row["petal_length"]

iris.head(1)

	sepal_length	sepal_width	petal_length	petal_width	species	test
0	5.1	3.5	1.4	0.2	setosa	6.5

Things go well in this case with loc, but I used at.

loc is slower, but you can access to multiple locations
at is quicker, but you can access to only one single location

See here

A working and good example for this specific case:

But in this kind of simple summation case, you can just do

iris["test"] = iris["sepal_length"] + iris["petal_length"]
iris.head(1)

	sepal_length	sepal_width	petal_length	petal_width	species	test
0	5.1	3.5	1.4	0.2	setosa	6.5

You don’t even need that initialization.

The iteration is useful when you do some complicated jobs:

# Initialize
iris["test"] = None
for i, row in iris.iterrows():
    length_sum = row["sepal_length"] + row["petal_length"]
    if length_sum > 0.1:
        length_sum = 0.1
    iris.at[i, "test"] = length_sum

4.5.6. Drop#

pandas usually generate annoying index (0, 1, ..., N) columns etc. You can drop them by

iris = iris.drop(columns=["test"])

iris.head(1)

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa

NaN’s can be removed by .dropna():

iris_nan = iris.copy()
iris_nan.iloc[:2] = None
iris_nan.head()

	sepal_length	sepal_width	petal_length	petal_width	species
0	NaN	NaN	NaN	NaN	None
1	NaN	NaN	NaN	NaN	None
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

iris_nan = iris_nan.dropna()
iris_nan.head()  # try .reset_index() by yourself

	sepal_length	sepal_width	petal_length	petal_width	species
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
5	5.4	3.9	1.7	0.4	setosa
6	4.6	3.4	1.4	0.3	setosa

iris_nan.reset_index().head()

	index	sepal_length	sepal_width	petal_length	petal_width	species
0	2	4.7	3.2	1.3	0.2	setosa
1	3	4.6	3.1	1.5	0.2	setosa
2	4	5.0	3.6	1.4	0.2	setosa
3	5	5.4	3.9	1.7	0.4	setosa
4	6	4.6	3.4	1.4	0.3	setosa

iris_nan.reset_index(drop=True).head()

	sepal_length	sepal_width	petal_length	petal_width	species
0	4.7	3.2	1.3	0.2	setosa
1	4.6	3.1	1.5	0.2	setosa
2	5.0	3.6	1.4	0.2	setosa
3	5.4	3.9	1.7	0.4	setosa
4	4.6	3.4	1.4	0.3	setosa

# iris_nan.to_csv("test.csv", index=False)

4.5.7. Sorting#

iris.sort_values(by=["sepal_width", "petal_length"]).head()

	sepal_length	sepal_width	petal_length	petal_width	species
60	5.0	2.0	3.5	1.0	versicolor
62	6.0	2.2	4.0	1.0	versicolor
68	6.2	2.2	4.5	1.5	versicolor
119	6.0	2.2	5.0	1.5	virginica
41	4.5	2.3	1.3	0.3	setosa

As can be seen, the index numbers are kept but “ordered” based on the column.

Reset it:

iris_sort = (iris.sort_values(by="sepal_width")
             .reset_index(drop=True)
)
iris_sort.head()

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.0	2.0	3.5	1.0	versicolor
1	6.0	2.2	4.0	1.0	versicolor
2	6.0	2.2	5.0	1.5	virginica
3	6.2	2.2	4.5	1.5	versicolor
4	4.5	2.3	1.3	0.3	setosa

See what happens if drop = False.

4.5.8. Grouping#

Grouping is one of the most useful functionality of pandas.

grouped = iris.groupby("species")

print(type(grouped))

<class 'pandas.core.groupby.generic.DataFrameGroupBy'>

grouped.get_group("setosa").head(1)

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa

for name, group in grouped:
    print(name)
    print(group.describe())
    print()

setosa
       sepal_length  sepal_width  petal_length  petal_width
count      50.00000    50.000000     50.000000    50.000000
mean        5.00600     3.428000      1.462000     0.246000
std         0.35249     0.379064      0.173664     0.105386
min         4.30000     2.300000      1.000000     0.100000
25%         4.80000     3.200000      1.400000     0.200000
50%         5.00000     3.400000      1.500000     0.200000
75%         5.20000     3.675000      1.575000     0.300000
max         5.80000     4.400000      1.900000     0.600000

versicolor
       sepal_length  sepal_width  petal_length  petal_width
count     50.000000    50.000000     50.000000    50.000000
mean       5.936000     2.770000      4.260000     1.326000
std        0.516171     0.313798      0.469911     0.197753
min        4.900000     2.000000      3.000000     1.000000
25%        5.600000     2.525000      4.000000     1.200000
50%        5.900000     2.800000      4.350000     1.300000
75%        6.300000     3.000000      4.600000     1.500000
max        7.000000     3.400000      5.100000     1.800000

virginica
       sepal_length  sepal_width  petal_length  petal_width
count      50.00000    50.000000     50.000000     50.00000
mean        6.58800     2.974000      5.552000      2.02600
std         0.63588     0.322497      0.551895      0.27465
min         4.90000     2.200000      4.500000      1.40000
25%         6.22500     2.800000      5.100000      1.80000
50%         6.50000     3.000000      5.550000      2.00000
75%         6.90000     3.175000      5.875000      2.30000
max         7.90000     3.800000      6.900000      2.50000

4.5.9. Masking and Special Operations#

mask = iris["sepal_width"] < 3.3
iris.loc[mask].head()  # Try .reset_index() by yourself

	sepal_length	sepal_width	petal_length	petal_width	species
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
8	4.4	2.9	1.4	0.2	setosa
9	4.9	3.1	1.5	0.1	setosa

There are many of the things you can do for str, e.g., .split or .replace, etc.

You can do that on all the strs in a column without for loop in pandas:

setosa_mask = iris["species"].str.startswith("seto")
iris.loc[setosa_mask].head()

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

iris["species"].str.replace("seto", "testingtesting").head()

  testingtestingsa
  testingtestingsa
  testingtestingsa
  testingtestingsa
  testingtestingsa
Name: species, dtype: object

4.5.10. A Simple Plotting#

fig, axs = plt.subplots(1, 1, figsize=(8, 5), sharex=False, sharey=False, gridspec_kw=None)

for name, g in grouped:
    axs.plot(g["sepal_length"], g["sepal_width"], '.', alpha=0.5, label=name)
    
axs.legend(loc=1)
axs.grid()
axs.set(xlabel="sepal length", ylabel="sepal width")
plt.tight_layout()
plt.show()

../../_images/4163dfbc953741badd7b4e7e1ddec579e89cc3e8c20e1db7f414fd2b3a370d4f.png