Consuming iterator manually

Consuming Iterators Manually

A fairly typical use case for this would be when reading data from a CSV file where you know the first few lines consist of information abotu teh data rather than just the data itself.

with open('cars.csv') as file:
    for line in file:
        print(line)  

Car;MPG;Cylinders;Displacement;Horsepower;Weight;Acceleration;Model;Origin
STRING;DOUBLE;INT;DOUBLE;DOUBLE;DOUBLE;DOUBLE;INT;CAT

Chevrolet Chevelle Malibu;18.0;8;307.0;130.0;3504.;12.0;70;US
Buick Skylark 320;15.0;8;350.0;165.0;3693.;11.5;70;US

As we can see, the values are delimited by ; and the first two lines consist of the column names, and column types.

The reason for the spacing between each line is that each line ends with a newline, and our print statement also emits a newline by default. So we'll have to strip those out.

Here's what we want to do:

  • read the first line to get the column headers and create a named tuple class

  • read data types from second line and store this so we can cast the strings we are reading to the correct data type

  • read the data rows and parse them into a named tuples

with open('cars.csv') as file:
    row_index = 0
    for line in file:
        if row_index == 0:
            # header row
            headers = line.strip('\n').split(';')
            print(headers)
        elif row_index == 1:
            # data type row
            data_types = line.strip('\n').split(';')
            print(data_types)
        else:
            # data rows
            data = line.strip('\n').split(';')
            print(data)
        row_index += 1


['Car', 'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model', 'Origin']
['STRING', 'DOUBLE', 'INT', 'DOUBLE', 'DOUBLE', 'DOUBLE', 'DOUBLE', 'INT', 'CAT']
['Chevrolet Chevelle Malibu', '18.0', '8', '307.0', '130.0', '3504.', '12.0', '70', 'US']
['Buick Skylark 320', '15.0', '8', '350.0', '165.0', '3693.', '11.5', '70', 'US']
['Plymouth Satellite', '18.0', '8', '318.0', '150.0', '3436.', '11.0', '70', 'US']
......

# Using namedtuple
from collections import namedtuple
cars = []

with open('cars.csv') as file:
    row_index = 0
    for line in file:
        if row_index == 0:
            # header row
            headers = line.strip('\n').split(';')
            Car = namedtuple('Car', headers)
        elif row_index == 1:
            # data type row
            data_types = line.strip('\n').split(';')
            print(data_types)
        else:
            # data rows
            data = line.strip('\n').split(';')
            car = Car(*data)
            cars.append(car)
        row_index += 1

We still need to parse the data into strings, integers, floats... First we need to figure cast to a data type based on the data type string:

  • STRING --> str

  • DOUBLE --> float

  • INT --> int

  • CAT --> str

def cast(data_type, value):
    if data_type == 'DOUBLE':
        return float(value)
    elif data_type == 'INT':
        return int(value)
    else:
        return str(value)

def cast_row(data_types, data_row):
    return [cast(data_type, value) 
            for data_type, value in zip(data_types, data_row)]

Let's go back and fix up our original code now:

from collections import namedtuple
cars = []
with open('cars.csv') as file:
    row_index = 0
    for line in file:
        if row_index == 0:
            # header row
            headers = line.strip('\n').split(';')
            Car = namedtuple('Car', headers)
        elif row_index == 1:
            # data type row
            data_types = line.strip('\n').split(';')
        else:
            # data rows
            data = line.strip('\n').split(';')
            data = cast_row(data_types, data)
            car = Car(*data)
            cars.append(car)
        row_index += 1

We can clean up this code by using iterators directly:

from collections import namedtuple
cars = []

with open('cars.csv') as file:
    file_iter = iter(file)
    headers = next(file_iter).strip('\n').split(';')
    Car = namedtuple('Car', headers)
    data_types = next(file_iter).strip('\n').split(';')
    for line in file_iter:
        data = line.strip('\n').split(';')
        data = cast_row(data_types, data)
        car = Car(*data)
        cars.append(car)

# More cleaner way
from collections import namedtuple

with open('cars.csv') as file:
    file_iter = iter(file)
    headers = next(file_iter).strip('\n').split(';')
    data_types = next(file_iter).strip('\n').split(';')
    cars_data = [cast_row(data_types, 
                          line.strip('\n').split(';'))
                   for line in file_iter]
    cars = [Car(*item) for item in cars_data]

cars_data[0]
['Chevrolet Chevelle Malibu', 18.0, 8, 307.0, 130.0, 3504.0, 12.0, 70, 'US']

Last updated