A fairly typical use case for this would be when reading data from a CSV file where you know the first few lines consist of information about the data rather than just the data itself.
# Dump the raw file so we can look at its layout.
with open('cars.csv') as file:
    for line in file:
        # Each line still ends with its own '\n' and print() appends another,
        # which is why the output below is double-spaced.
        print(line)
Car;MPG;Cylinders;Displacement;Horsepower;Weight;Acceleration;Model;Origin
STRING;DOUBLE;INT;DOUBLE;DOUBLE;DOUBLE;DOUBLE;INT;CAT
Chevrolet Chevelle Malibu;18.0;8;307.0;130.0;3504.;12.0;70;US
Buick Skylark 320;15.0;8;350.0;165.0;3693.;11.5;70;US
As we can see, the values are delimited by ; and the first two lines consist of the column names, and column types.
The reason for the spacing between each line is that each line ends with a newline, and our print statement also emits a newline by default. So we'll have to strip those out.
Here's what we want to do:
read the first line to get the column headers and create a named tuple class
read data types from second line and store this so we can cast the strings we are reading to the correct data type
read the data rows and parse them into named tuples
# Split every line into fields, treating the first two lines as metadata.
with open('cars.csv') as file:
    # enumerate() supplies the row number, so no counter has to be
    # incremented by hand at the bottom of the loop.
    for row_index, line in enumerate(file):
        # Drop the trailing newline, then split on the ';' delimiter.
        fields = line.strip('\n').split(';')
        if row_index == 0:
            # header row
            headers = fields
            print(headers)
        elif row_index == 1:
            # data type row
            data_types = fields
            print(data_types)
        else:
            # data rows
            data = fields
            print(data)
['Car', 'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model', 'Origin']
['STRING', 'DOUBLE', 'INT', 'DOUBLE', 'DOUBLE', 'DOUBLE', 'DOUBLE', 'INT', 'CAT']
['Chevrolet Chevelle Malibu', '18.0', '8', '307.0', '130.0', '3504.', '12.0', '70', 'US']
['Buick Skylark 320', '15.0', '8', '350.0', '165.0', '3693.', '11.5', '70', 'US']
['Plymouth Satellite', '18.0', '8', '318.0', '150.0', '3436.', '11.0', '70', 'US']
......
# Using namedtuple
from collections import namedtuple
# Collect every data row as a Car named tuple (all field values are still
# strings at this stage).
cars = []
with open('cars.csv') as file:
    # enumerate() supplies the row number, so no counter has to be
    # incremented by hand at the bottom of the loop.
    for row_index, line in enumerate(file):
        # Drop the trailing newline, then split on the ';' delimiter.
        fields = line.strip('\n').split(';')
        if row_index == 0:
            # header row: the column names become the named tuple's fields
            headers = fields
            Car = namedtuple('Car', headers)
        elif row_index == 1:
            # data type row
            data_types = fields
            print(data_types)
        else:
            # data rows
            data = fields
            car = Car(*data)
            cars.append(car)
We still need to parse the data into strings, integers, floats... First we need to figure out how to cast a value to a data type based on the data type string:
STRING --> str
DOUBLE --> float
INT --> int
CAT --> str
def cast(data_type, value):
    """Convert ``value`` to the Python type named by ``data_type``.

    Args:
        data_type: Type name taken from the file's data type row.
            'DOUBLE' maps to float and 'INT' maps to int; every other
            name (including 'STRING' and 'CAT') maps to str.
        value: The raw field value to convert.

    Returns:
        ``value`` converted to float, int or str.

    Raises:
        ValueError: If ``value`` cannot be parsed as the requested
            numeric type.
    """
    if data_type == 'DOUBLE':
        return float(value)
    if data_type == 'INT':
        return int(value)
    # STRING, CAT and any unrecognised type name are kept as text.
    return str(value)
def cast_row(data_types, data_row):
    """Convert every field of one data row to its declared type.

    Args:
        data_types: Type names, one per column, in column order.
        data_row: Raw field values for a single row, in column order.

    Returns:
        A list holding the result of ``cast`` for each (type, value)
        pair. Pairing is done with ``zip``, so the result is as long as
        the shorter of the two inputs.
    """
    return [cast(data_type, value)
            for data_type, value in zip(data_types, data_row)]
Let's go back and fix up our original code now:
from collections import namedtuple
# Collect every data row as a Car named tuple with correctly typed fields.
cars = []
with open('cars.csv') as file:
    # enumerate() supplies the row number, so no counter has to be
    # incremented by hand at the bottom of the loop.
    for row_index, line in enumerate(file):
        # Drop the trailing newline, then split on the ';' delimiter.
        fields = line.strip('\n').split(';')
        if row_index == 0:
            # header row: the column names become the named tuple's fields
            headers = fields
            Car = namedtuple('Car', headers)
        elif row_index == 1:
            # data type row: kept so the data rows below can be cast
            data_types = fields
        else:
            # data rows: cast each field before building the record
            data = cast_row(data_types, fields)
            car = Car(*data)
            cars.append(car)
We can clean up this code by using iterators directly:
from collections import namedtuple
cars = []
with open('cars.csv') as file:
    file_iter = iter(file)
    # The first next() call consumes the header row ...
    headers = next(file_iter).strip('\n').split(';')
    Car = namedtuple('Car', headers)
    # ... and the second consumes the data type row.
    data_types = next(file_iter).strip('\n').split(';')
    # The iterator is now positioned on the first data row, so the loop
    # sees data rows only and needs no row counter.
    for line in file_iter:
        data = line.strip('\n').split(';')
        data = cast_row(data_types, data)
        car = Car(*data)
        cars.append(car)
# A cleaner way
from collections import namedtuple
with open('cars.csv') as file:
    file_iter = iter(file)
    # Consume the two metadata rows first; whatever remains in the
    # iterator is data.
    headers = next(file_iter).strip('\n').split(';')
    data_types = next(file_iter).strip('\n').split(';')
    cars_data = [cast_row(data_types,
                          line.strip('\n').split(';'))
                 for line in file_iter]
# Build the record class from the header row that was just read, so this
# snippet does not depend on a Car class left over from earlier code.
Car = namedtuple('Car', headers)
cars = [Car(*item) for item in cars_data]
# Look at the first parsed row (echoed when run interactively).
cars_data[0]
['Chevrolet Chevelle Malibu', 18.0, 8, 307.0, 130.0, 3504.0, 12.0, 70, 'US']