# Load the tab-delimited Gapminder text file as an RDD with one string per line.
gm_raw = sc.textFile('FileStore/tables/gapminder_data.txt')

# Show the first five raw lines (the header row comes first) as a sanity check.
for line in gm_raw.take(5):
    print(line)
country year continent population life_exp gdp_per_cap gini
Afghanistan 1800 asia 3280000 28.2 603 30.5
Albania 1800 europe 410000 35.4 667 38.9
Algeria 1800 africa 2500000 28.8 715 56.2
Angola 1800 africa 1570000 27 618 57.2
# Column names: the first line of the raw file, split on tabs.
header = (
    gm_raw
    .take(1)[0]
    .split('\t')
)
def process_row(row):
    """Parse one tab-delimited data line into a typed record.

    The returned list holds, in order: country (str), year (int),
    continent (str), population (int), life_exp (float),
    gdp_per_cap (int) and gini (float).

    Raises IndexError if the line has fewer than seven fields and
    ValueError if a numeric field cannot be converted.
    """
    fields = row.split('\t')
    country = fields[0]
    year = int(fields[1])
    continent = fields[2]
    population = int(fields[3])
    life_exp = float(fields[4])
    gdp_per_cap = int(fields[5])
    gini = float(fields[6])
    return [country, year, continent, population, life_exp, gdp_per_cap, gini]
# Build the parsed dataset: drop the header row, then convert every remaining
# line into a typed record with process_row.
#
# The header is removed by exact comparison with the file's first line rather
# than by a substring test for 'country', which would silently discard any
# data row that happens to contain that text.
header_line = gm_raw.first()
gm = (
    gm_raw
    .filter(lambda line: line != header_line)
    .map(process_row)
)
# Show the first five parsed records to confirm the field types.
for record in gm.take(5):
    print(record)
['Afghanistan', 1800, 'asia', 3280000, 28.2, 603, 30.5]
['Albania', 1800, 'europe', 410000, 35.4, 667, 38.9]
['Algeria', 1800, 'africa', 2500000, 28.8, 715, 56.2]
['Angola', 1800, 'africa', 1570000, 27.0, 618, 57.2]
['Antigua and Barbuda', 1800, 'americas', 37000, 33.5, 757, 40.0]
# Report the ten records with the largest population (field 3), descending.
# NOTE(review): gm_18 is not defined in this part of the file; presumably it
# is gm restricted to year 2018 - confirm.
print('Largest Populations in 2018')
print('-' * 40)
for rec in gm_18.sortBy(lambda rec: rec[3], ascending=False).take(10):
    # Country left-aligned in 30 columns, population right-aligned in 10.
    print('{:<30}{:>10}'.format(rec[0], rec[3]))
Largest Populations in 2018
----------------------------------------
China 1420000000
India 1350000000
United States 327000000
Indonesia 267000000
Brazil 211000000
Pakistan 201000000
Nigeria 196000000
Bangladesh 166000000
Russia 144000000
Mexico 131000000
# Report the ten records with the smallest population (field 3), ascending.
# NOTE(review): gm_18 is not defined in this part of the file; presumably it
# is gm restricted to year 2018 - confirm.
print('Smallest Populations in 2018')
print('-' * 40)
for rec in gm_18.sortBy(lambda rec: rec[3]).take(10):
    # Country left-aligned in 30 columns, population right-aligned in 10.
    print('{:<30}{:>10}'.format(rec[0], rec[3]))
Smallest Populations in 2018
----------------------------------------
Seychelles 95200
Antigua and Barbuda 103000
Micronesia, Fed. Sts. 106000
Grenada 108000
Tonga 109000
St. Vincent and the Grenadines 110000
Kiribati 118000
St. Lucia 180000
Samoa 198000
Sao Tome and Principe 209000
# Report the ten records with the highest life expectancy (field 4), descending.
# NOTE(review): gm_18 is not defined in this part of the file; presumably it
# is gm restricted to year 2018 - confirm.
print('Highest Life Expectancy in 2018')
print('-' * 40)
for rec in gm_18.sortBy(lambda rec: rec[4], ascending=False).take(10):
    # Country left-aligned in 30 columns, life expectancy right-aligned in 10.
    print('{:<30}{:>10}'.format(rec[0], rec[4]))
Highest Life Expectancy in 2018
----------------------------------------
Japan 84.2
Singapore 84.0
Switzerland 83.5
Spain 83.2
Australia 82.9
France 82.6
Iceland 82.6
Italy 82.6
Israel 82.4
Luxembourg 82.4
# Report the ten records with the lowest life expectancy (field 4), ascending.
# NOTE(review): gm_18 is not defined in this part of the file; presumably it
# is gm restricted to year 2018 - confirm.
print('Lowest Life Expectancy in 2018')
print('-' * 40)
for rec in gm_18.sortBy(lambda rec: rec[4]).take(10):
    # Country left-aligned in 30 columns, life expectancy right-aligned in 10.
    print('{:<30}{:>10}'.format(rec[0], rec[4]))
Lowest Life Expectancy in 2018
----------------------------------------
Lesotho 51.1
Central African Republic 51.6
Somalia 58.0
Swaziland 58.6
Afghanistan 58.7
Zambia 59.5
Guinea-Bissau 59.7
Sierra Leone 60.0
Zimbabwe 60.2
Chad 60.5
Lesson 12 - Example: Gapminder Dataset