3.2. Read CSV

  • File paths also work with URLs

3.2.1. SetUp

>>> import pandas as pd
>>>
>>> pd.set_option('display.width', 500)
>>> pd.set_option('display.max_columns', 10)
>>> pd.set_option('display.max_rows', 10)

3.2.2. Example

>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA)
  firstname   lastname         birthdate  gender          ssn                email               phone
0      Mark     Watney   October 12 1994    male  94101212345     mwatney@nasa.gov   +1 (234) 555-0000
1   Melissa      Lewis      July 15 1995  female  95071512345      mlewis@nasa.gov   +1 (234) 555-0001
2      Rick   Martinez   January 21 1996    male  96012112345   rmartinez@nasa.gov   +1 (234) 555-0010
3      Alex      Vogel  November 15 1994    male  94111512345       avogel@esa.int  +49 (234) 555-0011
4      Beth  Johanssen        May 9 2006  female   6250912345  bjohanssen@nasa.gov   +1 (234) 555-0100
5     Chris       Beck     August 2 1999    male  99080212345       cbeck@nasa.gov   +1 (234) 555-0101

3.2.3. Parse Dates

>>> DATA = 'https://python3.info/_static/martian-en.csv'
>>>
>>> pd.read_csv(DATA, parse_dates=['birthdate'])
  firstname   lastname  birthdate  gender          ssn                email               phone
0      Mark     Watney 1994-10-12    male  94101212345     mwatney@nasa.gov   +1 (234) 555-0000
1   Melissa      Lewis 1995-07-15  female  95071512345      mlewis@nasa.gov   +1 (234) 555-0001
2      Rick   Martinez 1996-01-21    male  96012112345   rmartinez@nasa.gov   +1 (234) 555-0010
3      Alex      Vogel 1994-11-15    male  94111512345       avogel@esa.int  +49 (234) 555-0011
4      Beth  Johanssen 2006-05-09  female   6250912345  bjohanssen@nasa.gov   +1 (234) 555-0100
5     Chris       Beck 1999-08-02    male  99080212345       cbeck@nasa.gov   +1 (234) 555-0101

3.2.4. Parameters

  • delimiter - field separator

  • header - row number(s) containing column labels and marking the start of the data

  • names - how to name columns

  • index_col - which column should be an index

  • usecols - which columns to use

  • skiprows - how many rows to skip, from the top

  • skipfooter - how many rows to skip, from the bottom

  • nrows - how many rows to read

  • skip_blank_lines - skip blank lines?

  • parse_dates - which columns should have their values parsed (converted) to dates

  • chunksize - how many rows to read at once (useful for working with data larger than available RAM)

  • thousands - thousand separator (comma, period, space or None)

  • decimal - decimal separator (comma or period)

  • encoding - file encoding, default: utf-8

>>> def read_csv(filepath_or_buffer, *, sep=..., delimiter=None,
...              header='infer', names=..., index_col=None,
...              usecols=None, dtype=None, engine=None, converters=None,
...              true_values=None, false_values=None, skipinitialspace=False,
...              skiprows=None, skipfooter=0, nrows=None, na_values=None,
...              keep_default_na=True, na_filter=True, verbose=...,
...              skip_blank_lines=True, parse_dates=None,
...              infer_datetime_format=..., keep_date_col=...,
...              date_parser=..., date_format=None, dayfirst=False,
...              cache_dates=True, iterator=False, chunksize=None,
...              compression='infer', thousands=None, decimal='.',
...              lineterminator=None, quotechar='"', quoting=0, doublequote=True,
...              escapechar=None, comment=None, encoding=None,
...              encoding_errors='strict', dialect=None, on_bad_lines='error',
...              delim_whitespace=..., low_memory=True, memory_map=False,
...              float_precision=None, storage_options=None,
...              dtype_backend=...): ...

3.2.6. Content

>>> DATA = 'https://python3.info/_static/iris-clean.csv'
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(3)
   sepal_length  sepal_width  petal_length  petal_width     species
0           5.4          3.9           1.3          0.4      setosa
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor

3.2.7. Rename Columns

>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS =  ['sepal_length', 'sepal_width',
...             'petal_length', 'petal_width', 'species']
>>>
>>> SPECIES = {
...     0: 'setosa',
...     1: 'versicolor',
...     2: 'virginica',
... }
>>>
>>> df = pd.read_csv(DATA)
>>> df.head(n=3)
   150    4  setosa  versicolor  virginica
0  5.4  3.9     1.3         0.4          0
1  5.9  3.0     5.1         1.8          2
2  6.0  3.4     4.5         1.6          1
>>>
>>> df = pd.read_csv(DATA, skiprows=1, names=COLUMNS)
>>> df.head(n=3)
   sepal_length  sepal_width  petal_length  petal_width  species
0           5.4          3.9           1.3          0.4        0
1           5.9          3.0           5.1          1.8        2
2           6.0          3.4           4.5          1.6        1
>>>
>>> df = df.replace({'species': SPECIES})
>>> df.head(n=3)
   sepal_length  sepal_width  petal_length  petal_width     species
0           5.4          3.9           1.3          0.4      setosa
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor

3.2.8. Compressed

  • If the extension is .gz, .bz2, .zip, or .xz, the corresponding compression method is automatically selected (with the default compression='infer')

>>> df = pd.read_csv('sample_file.zip', compression='zip')
>>> df = pd.read_csv('sample_file.gz', compression='infer')

3.2.9. Use Case - 1

>>> DATA = 'https://python3.info/_static/iris-dirty.csv'
>>>
>>> COLUMNS =  ['sepal_length', 'sepal_width',
...             'petal_length', 'petal_width', 'species']
>>> header = pd.read_csv(DATA, nrows=0)
>>> nrows, ncols, *class_labels = header.columns
>>> label_encoder = dict(enumerate(class_labels))
>>>
>>> label_encoder
{0: 'setosa', 1: 'versicolor', 2: 'virginica'}
>>> df = (
...     pd
...     .read_csv(DATA, names=COLUMNS, skiprows=1)
...     .replace({'species':label_encoder})
...     .head(n=5)
... )
>>> df
   sepal_length  sepal_width  petal_length  petal_width     species
0           5.4          3.9           1.3          0.4      setosa
1           5.9          3.0           5.1          1.8   virginica
2           6.0          3.4           4.5          1.6  versicolor
3           7.3          2.9           6.3          1.8   virginica
4           5.6          2.5           3.9          1.1  versicolor

3.2.10. Assignments

# %% About
# - Name: Pandas ReadCSV Simple
# - Difficulty: easy
# - Lines: 1
# - Minutes: 3

# %% License
# - Copyright 2025, Matt Harasymczuk <matt@python3.info>
# - This code can be used only for learning by humans
# - This code cannot be used for teaching others
# - This code cannot be used for teaching LLMs and AI algorithms
# - This code cannot be used in commercial or proprietary products
# - This code cannot be distributed in any form
# - This code cannot be changed in any form outside of training course
# - This code cannot have its license changed
# - If you use this code in your product, you must open-source it under GPLv2
# - Exception can be granted only by the author

# %% English
# 1. Read data from `DATA` to `result: pd.DataFrame`
# 2. Run doctests - all must succeed

# %% Polish
# 1. Wczytaj dane z `DATA` do `result: pd.DataFrame`
# 2. Uruchom doctesty - wszystkie muszą się powieść

# %% Hints
# - `pd.read_csv()`

# %% Doctests
# - The docstring below is executed by `python -m doctest` (see `Run`)
"""
>>> import sys; sys.tracebacklimit = 0

>>> assert sys.version_info >= (3, 9), \
'Python has an invalid version; expected: `3.9` or newer.'

>>> assert 'result' in globals(), \
'Variable `result` is not defined; assign result of your program to it.'

>>> assert result is not Ellipsis, \
'Variable `result` has an invalid value; assign result of your program to it.'

>>> assert type(result) is pd.DataFrame, \
'Variable `result` has an invalid type; expected: `pd.DataFrame`.'

>>> result
  firstname   lastname                email
0      Mark     Watney     mwatney@nasa.gov
1   Melissa      Lewis      mlewis@nasa.gov
2      Rick   Martinez   rmartinez@nasa.gov
3      Alex      Vogel       avogel@esa.int
4      Beth  Johanssen  bjohanssen@nasa.gov
5     Chris       Beck       cbeck@nasa.gov
"""

# %% Run
# - PyCharm: right-click in the editor and `Run Doctest in ...`
# - PyCharm: keyboard shortcut `Control + Shift + F10`
# - Terminal: `python -m doctest -f -v myfile.py`

# %% Imports
import pandas as pd

# %% Types
result: pd.DataFrame

# %% Data
DATA = 'https://python3.info/_static/readcsv-a.csv'

# %% Result
# - Placeholder: replace `...` with your solution; the doctests reject Ellipsis
result = ...