6.25. DataFrame Recap

6.25.1. Assignments

# %% About
# - Name: DataFrame Select
# - Difficulty: easy
# - Lines: 5
# - Minutes: 3

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Load data from `DATA` as `df: pd.DataFrame`
# 2. Select rows where 'petal_length' is above 2.0
# 3. Display first 5 rows
# 4. Do not use `.query()`
# 5. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. Wybierz wiersze, gdzie wartość 'petal_length' jest powyżej 2.0
# 3. Wyświetl 5 pierwszych wierszy
# 4. Nie używaj `.query()`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result  # doctest: +NORMALIZE_WHITESPACE
#    sepal_length  sepal_width  petal_length  petal_width     species
# 1           5.9          3.0           5.1          1.8   virginica
# 2           6.0          3.4           4.5          1.6  versicolor
# 3           7.3          2.9           6.3          1.8   virginica
# 4           5.6          2.5           3.9          1.1  versicolor
# 6           5.5          2.6           4.4          1.2  versicolor

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
   sepal_length  sepal_width  petal_length  petal_width     species
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor
3           7.3          2.9           6.3          1.8   virginica
4           5.6          2.5           3.9          1.1  versicolor
6           5.5          2.6           4.4          1.2  versicolor
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# Annotation only: declares the expected type of `result` without
# binding a value to the name.
result: pd.DataFrame

# %% Data
# Location of the CSV file that the task loads into `df`.
DATA = 'https://python3.info/_static/iris-clean.csv'

# %% Result
# Placeholder (Ellipsis) to be replaced with the solution; the doctests
# in this exercise reject Ellipsis and require a `pd.DataFrame`.
result = ...

# %% About
# - Name: DataFrame Select
# - Difficulty: easy
# - Lines: 5
# - Minutes: 3

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Load data from `DATA` as `df: pd.DataFrame`
# 2. Select rows where 'petal_length' is above 2.0
# 3. Display first 5 rows
# 4. Use `.query()`
# 5. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. Wybierz wiersze, gdzie wartość 'petal_length' jest powyżej 2.0
# 3. Wyświetl 5 pierwszych wierszy
# 4. Użyj `.query()`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result  # doctest: +NORMALIZE_WHITESPACE
#    sepal_length  sepal_width  petal_length  petal_width     species
# 1           5.9          3.0           5.1          1.8   virginica
# 2           6.0          3.4           4.5          1.6  versicolor
# 3           7.3          2.9           6.3          1.8   virginica
# 4           5.6          2.5           3.9          1.1  versicolor
# 6           5.5          2.6           4.4          1.2  versicolor

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
   sepal_length  sepal_width  petal_length  petal_width     species
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor
3           7.3          2.9           6.3          1.8   virginica
4           5.6          2.5           3.9          1.1  versicolor
6           5.5          2.6           4.4          1.2  versicolor
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# Annotation only: declares the expected type of `result` without
# binding a value to the name.
result: pd.DataFrame

# %% Data
# Location of the CSV file that the task loads into `df`.
DATA = 'https://python3.info/_static/iris-clean.csv'

# %% Result
# Placeholder (Ellipsis) to be replaced with the solution; the doctests
# in this exercise reject Ellipsis and require a `pd.DataFrame`.
result = ...

# %% About
# - Name: DataFrame Alter Categorize
# - Difficulty: medium
# - Lines: 8
# - Minutes: 8

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Modify `df: pd.DataFrame` (cars dataset)
# 2. Add column `status` with values:
#    - `new` if `mileage` from 0 to 10_000 km
#    - `young` if `mileage` from 10_000 km to 100_000 km
#    - `old` if `mileage` above 100_000 km
# 3. All ranges include lower bounds and exclude upper bounds (exactly 100_000 km is `old`)
# 4. Do not use `pd.cut()` or `np.select()`
# 5. Run doctests - all must succeed

# %% Polish
# 1. Zmodyfikuj `df: pd.DataFrame` (zestaw danych o samochodach)
# 2. Dodaj kolumnę `status` o wartościach:
#    - `new` jeżeli `mileage` od 0 do 10_000 km
#    - `young` jeżeli `mileage` od 10_000 km do 100_000 km
#    - `old` jeżeli `mileage` powyżej 100_000 km
# 3. Wszystkie przedziały włączają dolny zakres i wyłączają górny zakres
# 4. Nie używaj `pd.cut()` ani `np.select()`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#     mileage  consumption status
# 0    199340            2    old
# 1     43567            0  young
# 2    173685            0    old
# 3    117952            4    old
# 4    176963            5    old
# 5    152315            6    old
# 6     95939            8  young
# 7     97639           20  young
# 8     41993           17  young
# 9    122579           15    old
# 10    86293            4  young
# 11   186098            9    old
# 12   112420           10    old
# 13    48600            1  young
# 14   170584            1    old
# 15    52620            7  young
# 16    80186            9  young
# 17    17089            3  young
# 18   163302            6    old
# 19   150055           11    old
# 20   108631           14    old
# 21   101201           18    old
# 22    82457            0  young
# 23   137993           14    old
# 24   169876            3    old
# 25    67699           12  young
# 26    70608           10  young
# 27   168691           20    old
# 28     7877           11    new
# 29    83966            4  young
# 30   132943            6    old
# 31    73135            4  young
# 32   133568           15    old
# 33   179026           20    old
# 34   155747            3    old
# 35   108504           12    old
# 36    31921            4  young
# 37   131869           20    old
# 38    49811            8  young
# 39   199827           14    old
# 40   122144           15    old
# 41   183561           20    old
# 42    84665            3  young
# 43   119423           15    old
# 44    41504           13  young
# 45    49866           16  young
# 46   123031           17    old
# 47   125603            5    old
# 48    11723            9  young
# 49   174962            3    old

# %% Hints
# - `pd.NA`
# - `DataFrame.loc[query, column] = value`
# - `Series.between()`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
    mileage  consumption status
0    199340            2    old
1     43567            0  young
2    173685            0    old
3    117952            4    old
4    176963            5    old
5    152315            6    old
6     95939            8  young
7     97639           20  young
8     41993           17  young
9    122579           15    old
10    86293            4  young
11   186098            9    old
12   112420           10    old
13    48600            1  young
14   170584            1    old
15    52620            7  young
16    80186            9  young
17    17089            3  young
18   163302            6    old
19   150055           11    old
20   108631           14    old
21   101201           18    old
22    82457            0  young
23   137993           14    old
24   169876            3    old
25    67699           12  young
26    70608           10  young
27   168691           20    old
28     7877           11    new
29    83966            4  young
30   132943            6    old
31    73135            4  young
32   133568           15    old
33   179026           20    old
34   155747            3    old
35   108504           12    old
36    31921            4  young
37   131869           20    old
38    49811            8  young
39   199827           14    old
40   122144           15    old
41   183561           20    old
42    84665            3  young
43   119423           15    old
44    41504           13  young
45    49866           16  young
46   123031           17    old
47   125603            5    old
48    11723            9  young
49   174962            3    old
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd
import numpy as np

# %% Types
# Expected type of the solution; this line only annotates the name.
result: pd.DataFrame

# %% Data
# Fixed seed so that the generated values match the expected doctest output.
np.random.seed(0)

# `mileage` is drawn before `consumption`; the order of the two draws
# determines the generated values, so keep it as is.
df = pd.DataFrame(data={
    'mileage': np.random.randint(low=0, high=200_000, size=50),
    'consumption': np.random.randint(low=0, high=21, size=50),
})

# %% Result
# Placeholder (Ellipsis) to be replaced with the solution.
result = ...

# %% About
# - Name: DataFrame Alter Cut
# - Difficulty: medium
# - Lines: 15
# - Minutes: 21

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Modify `df: pd.DataFrame` (cars dataset)
# 2. Using `pd.cut` add column `type`:
#    - if `consumption` from 0 to 1 then `type` is `electric`
#    - if `consumption` from 1 to 10 then `type` is `car`
#    - if `consumption` from 10 to 100 then `type` is `truck`
# 3. All ranges include upper bounds; the first range also includes its lower bound (0)
# 4. Use `pd.cut()` function
# 5. Run doctests - all must succeed

# %% Polish
# 1. Zmodyfikuj `df: pd.DataFrame` (zestaw danych o samochodach)
# 2. Używając `pd.cut` dodaj kolumnę `type`:
#    - jeżeli `consumption` od 0 do 1 to `type` jest `electric`
#    - jeżeli `consumption` od 1 do 10 to `type` jest `car`
#    - jeżeli `consumption` od 10 do 100 to `type` jest `truck`
# 3. Wszystkie przedziały włączają górny zakres; pierwszy przedział włącza także dolny zakres (0)
# 4. Użyj funkcji `pd.cut()`
# 5. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#     mileage  consumption      type
# 0    199340            2       car
# 1     43567            0  electric
# 2    173685            0  electric
# 3    117952            4       car
# 4    176963            5       car
# 5    152315            6       car
# 6     95939            8       car
# 7     97639           20     truck
# 8     41993           17     truck
# 9    122579           15     truck
# 10    86293            4       car
# 11   186098            9       car
# 12   112420           10       car
# 13    48600            1  electric
# 14   170584            1  electric
# 15    52620            7       car
# 16    80186            9       car
# 17    17089            3       car
# 18   163302            6       car
# 19   150055           11     truck
# 20   108631           14     truck
# 21   101201           18     truck
# 22    82457            0  electric
# 23   137993           14     truck
# 24   169876            3       car
# 25    67699           12     truck
# 26    70608           10       car
# 27   168691           20     truck
# 28     7877           11     truck
# 29    83966            4       car
# 30   132943            6       car
# 31    73135            4       car
# 32   133568           15     truck
# 33   179026           20     truck
# 34   155747            3       car
# 35   108504           12     truck
# 36    31921            4       car
# 37   131869           20     truck
# 38    49811            8       car
# 39   199827           14     truck
# 40   122144           15     truck
# 41   183561           20     truck
# 42    84665            3       car
# 43   119423           15     truck
# 44    41504           13     truck
# 45    49866           16     truck
# 46   123031           17     truck
# 47   125603            5       car
# 48    11723            9       car
# 49   174962            3       car

# %% Hints
# - `pd.DataFrame()`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result
    mileage  consumption      type
0    199340            2       car
1     43567            0  electric
2    173685            0  electric
3    117952            4       car
4    176963            5       car
5    152315            6       car
6     95939            8       car
7     97639           20     truck
8     41993           17     truck
9    122579           15     truck
10    86293            4       car
11   186098            9       car
12   112420           10       car
13    48600            1  electric
14   170584            1  electric
15    52620            7       car
16    80186            9       car
17    17089            3       car
18   163302            6       car
19   150055           11     truck
20   108631           14     truck
21   101201           18     truck
22    82457            0  electric
23   137993           14     truck
24   169876            3       car
25    67699           12     truck
26    70608           10       car
27   168691           20     truck
28     7877           11     truck
29    83966            4       car
30   132943            6       car
31    73135            4       car
32   133568           15     truck
33   179026           20     truck
34   155747            3       car
35   108504           12     truck
36    31921            4       car
37   131869           20     truck
38    49811            8       car
39   199827           14     truck
40   122144           15     truck
41   183561           20     truck
42    84665            3       car
43   119423           15     truck
44    41504           13     truck
45    49866           16     truck
46   123031           17     truck
47   125603            5       car
48    11723            9       car
49   174962            3       car
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd
import numpy as np

# %% Types
# Expected type of the solution; this line only annotates the name.
result: pd.DataFrame

# %% Data
# Fixed seed so that the generated values match the expected doctest output.
np.random.seed(0)

# `mileage` is drawn before `consumption`; the order of the two draws
# determines the generated values, so keep it as is.
df = pd.DataFrame(data={
    'mileage': np.random.randint(low=0, high=200_000, size=50),
    'consumption': np.random.randint(low=0, high=21, size=50),
})

# %% Result
# Placeholder (Ellipsis) to be replaced with the solution.
result = ...

# %% About
# - Name: DataFrame NaN
# - Difficulty: easy
# - Lines: 10
# - Minutes: 8

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data from `DATA` as `df: pd.DataFrame`
# 2. Skip first line with metadata
# 3. Rename columns to:
#    - sepal_length
#    - sepal_width
#    - petal_length
#    - petal_width
#    - species
# 4. Replace values in column species
#    - 0 -> 'setosa',
#    - 1 -> 'versicolor',
#    - 2 -> 'virginica'
# 5. Select values in column 'petal_length' less than 4
# 6. Set selected values to `NaN`
# 7. Drop rows with remaining `NaN` values
# 8. Define `result` as first two rows
# 9. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` jako `df: pd.DataFrame`
# 2. Pomiń pierwszą linię z metadanymi
# 3. Zmień nazwy kolumn na:
#    - sepal_length
#    - sepal_width
#    - petal_length
#    - petal_width
#    - species
# 4. Podmień wartości w kolumnie species
#    - 0 -> 'setosa',
#    - 1 -> 'versicolor',
#    - 2 -> 'virginica'
# 5. Wybierz wartości w kolumnie 'petal_length' mniejsze od 4
# 6. Wybrane wartości ustaw na `NaN`
# 7. Usuń wiersze z pozostałymi wartościami `NaN`
# 8. Zdefiniuj `result` jako dwa pierwsze wiersze
# 9. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result  # doctest: +NORMALIZE_WHITESPACE
#    sepal_length  sepal_width  petal_length  petal_width     species
# 1           5.9          3.0           5.1          1.8   virginica
# 2           6.0          3.4           4.5          1.6  versicolor

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
   sepal_length  sepal_width  petal_length  petal_width     species
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
# Expected type of the solution; this line only annotates the name.
result: pd.DataFrame

# %% Data
# CSV source; per the task, its first line holds metadata to be skipped.
DATA = 'https://python3.info/_static/iris-dirty.csv'

# Column names to assign to the loaded data (task step 3).
COLUMNS = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]

# Numeric species code -> species name (task step 4).
LABELS = {
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica',
}

# %% Result
# Placeholder (Ellipsis) to be replaced with the solution.
result = ...

# %% About
# - Name: DataFrame Statistics
# - Difficulty: medium
# - Lines: 1
# - Minutes: 2

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Save basic descriptive statistics to `result: pd.DataFrame`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Zapisz podstawowe statystyki opisowe do `result: pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Expected
# >>> result
#            mileage  consumption
# count      50.0000      50.0000
# mean   110421.0200       9.3200
# std     53170.2433       6.2448
# min      7877.0000       0.0000
# 25%     71239.7500       4.0000
# 50%    115186.0000       9.0000
# 75%    154889.0000      14.7500
# max    199827.0000      20.0000

# %% Hints
# - `DataFrame.describe()`

# %% Doctests
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> pd.set_option('display.max_columns', 50)
>>> pd.set_option('display.max_rows', 200)
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.memory_usage', 'deep')
>>> pd.set_option('display.precision', 4)

>>> result  # doctest: +NORMALIZE_WHITESPACE
           mileage  consumption
count      50.0000      50.0000
mean   110421.0200       9.3200
std     53170.2433       6.2448
min      7877.0000       0.0000
25%     71239.7500       4.0000
50%    115186.0000       9.0000
75%    154889.0000      14.7500
max    199827.0000      20.0000
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd
import numpy as np

# %% Types
# Expected type of the solution; this line only annotates the name.
result: pd.DataFrame

# %% Data
# Fixed seed so that the generated values match the expected doctest output.
np.random.seed(0)

# `mileage` is drawn before `consumption`; the order of the two draws
# determines the generated values, so keep it as is.
df = pd.DataFrame(data={
    'mileage': np.random.randint(low=0, high=200_000, size=50),
    'consumption': np.random.randint(low=0, high=21, size=50),
})

# %% Result
# Placeholder (Ellipsis) to be replaced with the solution.
result = ...