6.25. DataFrame Recap
6.25.1. Assignments
# %% About
# - Name: DataFrame Select
# - Difficulty: easy
# - Lines: 5
# - Minutes: 3
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Load data from `DATA` as `df: pd.DataFrame`
# 2. Select rows where 'petal_length' is above 2.0
# 3. Display first 5 rows
# 4. Do not use `.query()`
# 5. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. Wybierz wiersze, gdzie wartość 'petal_length' jest powyżej 2.0
# 3. Wyświetl 5 pierwszych wierszy
# 4. Nie używaj `.query()`
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result # doctest: +NORMALIZE_WHITESPACE
# sepal_length sepal_width petal_length petal_width species
# 1 5.9 3.0 5.1 1.8 virginica
# 2 6.0 3.4 4.5 1.6 versicolor
# 3 7.3 2.9 6.3 1.8 virginica
# 4 5.6 2.5 3.9 1.1 versicolor
# 6 5.5 2.6 4.4 1.2 versicolor
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
sepal_length sepal_width petal_length petal_width species
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
3 7.3 2.9 6.3 1.8 virginica
4 5.6 2.5 3.9 1.1 versicolor
6 5.5 2.6 4.4 1.2 versicolor
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
# Annotation only: declares the expected type of `result` without binding it.
result: pd.DataFrame
# %% Data
# Location of the iris dataset that the task loads into `df` (task step 1).
DATA = 'https://python3.info/_static/iris-clean.csv'
# %% Result
# Placeholder to be replaced with the solution; the doctests fail while
# `result` is still `Ellipsis` or is not a `pd.DataFrame`.
result = ...
# %% About
# - Name: DataFrame Select
# - Difficulty: easy
# - Lines: 5
# - Minutes: 3
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Load data from `DATA` as `df: pd.DataFrame`
# 2. Select rows where 'petal_length' is above 2.0
# 3. Display first 5 rows
# 4. Use `.query()`
# 5. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. Wybierz wiersze, gdzie wartość 'petal_length' jest powyżej 2.0
# 3. Wyświetl 5 pierwszych wierszy
# 4. Użyj `.query()`
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result # doctest: +NORMALIZE_WHITESPACE
# sepal_length sepal_width petal_length petal_width species
# 1 5.9 3.0 5.1 1.8 virginica
# 2 6.0 3.4 4.5 1.6 versicolor
# 3 7.3 2.9 6.3 1.8 virginica
# 4 5.6 2.5 3.9 1.1 versicolor
# 6 5.5 2.6 4.4 1.2 versicolor
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
sepal_length sepal_width petal_length petal_width species
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
3 7.3 2.9 6.3 1.8 virginica
4 5.6 2.5 3.9 1.1 versicolor
6 5.5 2.6 4.4 1.2 versicolor
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
# Annotation only: declares the expected type of `result` without binding it.
result: pd.DataFrame
# %% Data
# Location of the iris dataset that the task loads into `df` (task step 1).
DATA = 'https://python3.info/_static/iris-clean.csv'
# %% Result
# Placeholder to be replaced with the `.query()`-based solution; the doctests
# fail while `result` is still `Ellipsis` or is not a `pd.DataFrame`.
result = ...
# %% About
# - Name: DataFrame Alter Categorize
# - Difficulty: medium
# - Lines: 8
# - Minutes: 8
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Modify `df: pd.DataFrame` (cars dataset)
# 2. Add column `status` with values:
# - `new` if `mileage` from 0 to 10_000 km
# - `young` if `mileage` from 10_000 km to 100_000 km
# - `old` if `mileage` above 100_000 km
# 3. All ranges include lower bounds and exclude upper bounds
# 4. Do not use `pd.cut()` or `np.select()`
# 5. Run doctests - all must succeed
# %% Polish
# 1. Zmodyfikuj `df: pd.DataFrame` (zestaw danych o samochodach)
# 2. Dodaj kolumnę `status` o wartościach:
# - `new` jeżeli `mileage` od 0 do 10_000 km
# - `young` jeżeli `mileage` od 10_000 km do 100_000 km
# - `old` jeżeli `mileage` powyżej 100_000 km
# 3. Wszystkie przedziały włączają dolny zakres i wyłączają górny zakres
# 4. Nie używaj `pd.cut()` ani `np.select()`
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# mileage consumption status
# 0 199340 2 old
# 1 43567 0 young
# 2 173685 0 old
# 3 117952 4 old
# 4 176963 5 old
# 5 152315 6 old
# 6 95939 8 young
# 7 97639 20 young
# 8 41993 17 young
# 9 122579 15 old
# 10 86293 4 young
# 11 186098 9 old
# 12 112420 10 old
# 13 48600 1 young
# 14 170584 1 old
# 15 52620 7 young
# 16 80186 9 young
# 17 17089 3 young
# 18 163302 6 old
# 19 150055 11 old
# 20 108631 14 old
# 21 101201 18 old
# 22 82457 0 young
# 23 137993 14 old
# 24 169876 3 old
# 25 67699 12 young
# 26 70608 10 young
# 27 168691 20 old
# 28 7877 11 new
# 29 83966 4 young
# 30 132943 6 old
# 31 73135 4 young
# 32 133568 15 old
# 33 179026 20 old
# 34 155747 3 old
# 35 108504 12 old
# 36 31921 4 young
# 37 131869 20 old
# 38 49811 8 young
# 39 199827 14 old
# 40 122144 15 old
# 41 183561 20 old
# 42 84665 3 young
# 43 119423 15 old
# 44 41504 13 young
# 45 49866 16 young
# 46 123031 17 old
# 47 125603 5 old
# 48 11723 9 young
# 49 174962 3 old
# %% Hints
# - `pd.NA`
# - `DataFrame.loc[query, column] = value`
# - `Series.between()`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
mileage consumption status
0 199340 2 old
1 43567 0 young
2 173685 0 old
3 117952 4 old
4 176963 5 old
5 152315 6 old
6 95939 8 young
7 97639 20 young
8 41993 17 young
9 122579 15 old
10 86293 4 young
11 186098 9 old
12 112420 10 old
13 48600 1 young
14 170584 1 old
15 52620 7 young
16 80186 9 young
17 17089 3 young
18 163302 6 old
19 150055 11 old
20 108631 14 old
21 101201 18 old
22 82457 0 young
23 137993 14 old
24 169876 3 old
25 67699 12 young
26 70608 10 young
27 168691 20 old
28 7877 11 new
29 83966 4 young
30 132943 6 old
31 73135 4 young
32 133568 15 old
33 179026 20 old
34 155747 3 old
35 108504 12 old
36 31921 4 young
37 131869 20 old
38 49811 8 young
39 199827 14 old
40 122144 15 old
41 183561 20 old
42 84665 3 young
43 119423 15 old
44 41504 13 young
45 49866 16 young
46 123031 17 old
47 125603 5 old
48 11723 9 young
49 174962 3 old
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
import numpy as np
# %% Types
# Annotation only: declares the expected type of `result` without binding it.
result: pd.DataFrame
# %% Data
# Seed NumPy's global random generator so the same 50 rows are drawn on every
# run; the doctest output above lists these exact values.
np.random.seed(0)
# Cars dataset: the columns are drawn in this order (mileage first), so do not
# reorder the two `randint` calls or the generated values will change.
df = pd.DataFrame({
# Random integers from 0 up to, but not including, 200_000.
'mileage': np.random.randint(0, 200_000, size=50),
# Random integers from 0 to 20 (the upper bound 21 is exclusive).
'consumption': np.random.randint(0, 21, size=50),
})
# %% Result
# Placeholder to be replaced with the frame that carries the added `status`
# column; the doctests fail while `result` is still `Ellipsis`.
result = ...
# %% About
# - Name: DataFrame Alter Cut
# - Difficulty: medium
# - Lines: 15
# - Minutes: 21
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Modify `df: pd.DataFrame` (cars dataset)
# 2. Using `pd.cut` add column `type`:
# - if `consumption` from 0 to 1 then `type` is `electric`
# - if `consumption` from 1 to 10 then `type` is `car`
# - if `consumption` from 10 to 100 then `type` is `truck`
# 3. All ranges include upper bounds and exclude lower bounds, except the first range, which also includes 0
# 4. Use `pd.cut()` function
# 5. Run doctests - all must succeed
# %% Polish
# 1. Zmodyfikuj `df: pd.DataFrame` (zestaw danych o samochodach)
# 2. Używając `pd.cut` dodaj kolumnę `type`:
# - jeżeli `consumption` od 0 do 1 to `type` jest `electric`
# - jeżeli `consumption` od 1 do 10 to `type` jest `car`
# - jeżeli `consumption` od 10 do 100 to `type` jest `truck`
# 3. Wszystkie przedziały włączają górny zakres i wyłączają dolny zakres, z wyjątkiem pierwszego, który zawiera również 0
# 4. Użyj funkcji `pd.cut()`
# 5. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# mileage consumption type
# 0 199340 2 car
# 1 43567 0 electric
# 2 173685 0 electric
# 3 117952 4 car
# 4 176963 5 car
# 5 152315 6 car
# 6 95939 8 car
# 7 97639 20 truck
# 8 41993 17 truck
# 9 122579 15 truck
# 10 86293 4 car
# 11 186098 9 car
# 12 112420 10 car
# 13 48600 1 electric
# 14 170584 1 electric
# 15 52620 7 car
# 16 80186 9 car
# 17 17089 3 car
# 18 163302 6 car
# 19 150055 11 truck
# 20 108631 14 truck
# 21 101201 18 truck
# 22 82457 0 electric
# 23 137993 14 truck
# 24 169876 3 car
# 25 67699 12 truck
# 26 70608 10 car
# 27 168691 20 truck
# 28 7877 11 truck
# 29 83966 4 car
# 30 132943 6 car
# 31 73135 4 car
# 32 133568 15 truck
# 33 179026 20 truck
# 34 155747 3 car
# 35 108504 12 truck
# 36 31921 4 car
# 37 131869 20 truck
# 38 49811 8 car
# 39 199827 14 truck
# 40 122144 15 truck
# 41 183561 20 truck
# 42 84665 3 car
# 43 119423 15 truck
# 44 41504 13 truck
# 45 49866 16 truck
# 46 123031 17 truck
# 47 125603 5 car
# 48 11723 9 car
# 49 174962 3 car
# %% Hints
# - `pd.DataFrame()`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result
mileage consumption type
0 199340 2 car
1 43567 0 electric
2 173685 0 electric
3 117952 4 car
4 176963 5 car
5 152315 6 car
6 95939 8 car
7 97639 20 truck
8 41993 17 truck
9 122579 15 truck
10 86293 4 car
11 186098 9 car
12 112420 10 car
13 48600 1 electric
14 170584 1 electric
15 52620 7 car
16 80186 9 car
17 17089 3 car
18 163302 6 car
19 150055 11 truck
20 108631 14 truck
21 101201 18 truck
22 82457 0 electric
23 137993 14 truck
24 169876 3 car
25 67699 12 truck
26 70608 10 car
27 168691 20 truck
28 7877 11 truck
29 83966 4 car
30 132943 6 car
31 73135 4 car
32 133568 15 truck
33 179026 20 truck
34 155747 3 car
35 108504 12 truck
36 31921 4 car
37 131869 20 truck
38 49811 8 car
39 199827 14 truck
40 122144 15 truck
41 183561 20 truck
42 84665 3 car
43 119423 15 truck
44 41504 13 truck
45 49866 16 truck
46 123031 17 truck
47 125603 5 car
48 11723 9 car
49 174962 3 car
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
import numpy as np
# %% Types
# Annotation only: declares the expected type of `result` without binding it.
result: pd.DataFrame
# %% Data
# Seed NumPy's global random generator so the same 50 rows are drawn on every
# run; the doctest output above lists these exact values.
np.random.seed(0)
# Cars dataset: the columns are drawn in this order (mileage first), so do not
# reorder the two `randint` calls or the generated values will change.
df = pd.DataFrame({
# Random integers from 0 up to, but not including, 200_000.
'mileage': np.random.randint(0, 200_000, size=50),
# Random integers from 0 to 20 (the upper bound 21 is exclusive).
'consumption': np.random.randint(0, 21, size=50),
})
# %% Result
# Placeholder to be replaced with the frame that carries the added `type`
# column built with `pd.cut()`; the doctests fail while `result` is `Ellipsis`.
result = ...
# %% About
# - Name: DataFrame NaN
# - Difficulty: easy
# - Lines: 10
# - Minutes: 8
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Read data from `DATA` as `df: pd.DataFrame`
# 2. Skip first line with metadata
# 3. Rename columns to:
# - sepal_length
# - sepal_width
# - petal_length
# - petal_width
# - species
# 4. Replace values in column species
# - 0 -> 'setosa',
# - 1 -> 'versicolor',
# - 2 -> 'virginica'
# 5. Select values in column 'petal_length' less than 4
# 6. Set selected values to `NaN`
# 7. Drop rows with remaining `NaN` values
# 8. Define `result` as first two rows
# 9. Run doctests - all must succeed
# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. Pomiń pierwszą linię z metadanymi
# 3. Zmień nazwy kolumn na:
# - sepal_length
# - sepal_width
# - petal_length
# - petal_width
# - species
# 4. Podmień wartości w kolumnie species
# - 0 -> 'setosa',
# - 1 -> 'versicolor',
# - 2 -> 'virginica'
# 5. Wybierz wartości w kolumnie 'petal_length' mniejsze od 4
# 6. Wybrane wartości ustaw na `NaN`
# 7. Usuń wiersze z pozostałymi wartościami `NaN`
# 8. Zdefiniuj `result` jako dwa pierwsze wiersze
# 9. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result # doctest: +NORMALIZE_WHITESPACE
# sepal_length sepal_width petal_length petal_width species
# 1 5.9 3.0 5.1 1.8 virginica
# 2 6.0 3.4 4.5 1.6 versicolor
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
sepal_length sepal_width petal_length petal_width species
1 5.9 3.0 5.1 1.8 virginica
2 6.0 3.4 4.5 1.6 versicolor
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
# %% Types
# Annotation only: declares the expected type of `result` without binding it.
result: pd.DataFrame
# %% Data
# Source file for the exercise; per the task description its first line holds
# metadata that has to be skipped when reading.
DATA = 'https://python3.info/_static/iris-dirty.csv'
# Column names to assign to the loaded frame, in this order (task step 3).
COLUMNS = [
'sepal_length',
'sepal_width',
'petal_length',
'petal_width',
'species']
# Mapping from the numeric codes in the species column to species names
# (task step 4).
LABELS = {
0: 'setosa',
1: 'versicolor',
2: 'virginica',
}
# %% Result
# Placeholder to be replaced with the first two rows of the cleaned frame;
# the doctests fail while `result` is still `Ellipsis`.
result = ...
# %% About
# - Name: DataFrame Statistics
# - Difficulty: medium
# - Lines: 1
# - Minutes: 2
# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author
# %% English
# 1. Save basic descriptive statistics to `result: pd.DataFrame`
# 2. Run doctests - all must succeed
# %% Polish
# 1. Zapisz podstawowe statystyki opisowe do `result: pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść
# %% Expected
# >>> result
# mileage consumption
# count 50.0000 50.0000
# mean 110421.0200 9.3200
# std 53170.2433 6.2448
# min 7877.0000 0.0000
# 25% 71239.7500 4.0000
# 50% 115186.0000 9.0000
# 75% 154889.0000 14.7500
# max 199827.0000 20.0000
# %% Hints
# - `DataFrame.describe()`
# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0
>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'
>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'
>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'
>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'
>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)
>>> result # doctest: +NORMALIZE_WHITESPACE
mileage consumption
count 50.0000 50.0000
mean 110421.0200 9.3200
std 53170.2433 6.2448
min 7877.0000 0.0000
25% 71239.7500 4.0000
50% 115186.0000 9.0000
75% 154889.0000 14.7500
max 199827.0000 20.0000
"""
# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`
# %% Imports
import pandas as pd
import numpy as np
# %% Types
# Annotation only: declares the expected type of `result` without binding it.
result: pd.DataFrame
# %% Data
# Seed NumPy's global random generator so the same 50 rows are drawn on every
# run; the statistics in the doctest above depend on these exact values.
np.random.seed(0)
# Cars dataset: the columns are drawn in this order (mileage first), so do not
# reorder the two `randint` calls or the generated values will change.
df = pd.DataFrame({
# Random integers from 0 up to, but not including, 200_000.
'mileage': np.random.randint(0, 200_000, size=50),
# Random integers from 0 to 20 (the upper bound 21 is exclusive).
'consumption': np.random.randint(0, 21, size=50),
})
# %% Result
# Placeholder to be replaced with the descriptive statistics of `df` (see the
# `DataFrame.describe()` hint); the doctests fail while it is `Ellipsis`.
result = ...