# Build a small demo DataFrame: a person's name plus two numeric columns.
# The schema is given as a DDL-style string, one "name TYPE" entry per column.
my_schema = 'name STRING, x1 DOUBLE, x2 INTEGER'

# One inner list per row, in the same column order as the schema.
data = [
    ['Emma White', 5.2, 215],
    ['Art Brown', 4.1, 473],
    ['Carly Black', 3.7, 260],
    ['Beth Green', 4.5, 303],
    ['Dan Gray', 2.9, 185],
]

# `spark` is the active session, created outside this snippet.
df = spark.createDataFrame(data, my_schema)
df.show()
+-----------+---+---+
|       name| x1| x2|
+-----------+---+---+
| Emma White|5.2|215|
|  Art Brown|4.1|473|
|Carly Black|3.7|260|
| Beth Green|4.5|303|
|   Dan Gray|2.9|185|
+-----------+---+---+
# Apply built-in column functions to derive new columns:
# two string functions on `name` and two math functions on x1 / x2.
derived_columns = [
    F.upper(col('name')).alias('name_upper'),    # name in upper case
    F.length(col('name')).alias('name_length'),  # character count of name
    F.exp(col('x1')).alias('exp_x1'),            # e raised to x1
    F.log(col('x2')).alias('log_x2'),            # natural logarithm of x2
]
df.select(*derived_columns).show()
+-----------+-----------+------------------+------------------+
| name_upper|name_length|            exp_x1|            log_x2|
+-----------+-----------+------------------+------------------+
| EMMA WHITE|         10|181.27224187515122|5.3706380281276624|
|  ART BROWN|          9| 60.34028759736195| 6.159095388491933|
|CARLY BLACK|         11|  40.4473043600674| 5.560681631015528|
| BETH GREEN|         10| 90.01713130052181| 5.713732805509369|
|   DAN GRAY|          8| 18.17414536944306| 5.220355825078325|
+-----------+-----------+------------------+------------------+
# Same derived columns as before, but with the floating-point results
# rounded: exp(x1) to 2 decimal places and log(x2) to 4.
exp_of_x1 = F.exp(col('x1'))
log_of_x2 = F.log(col('x2'))

df.select(
    F.upper(col('name')).alias('name_upper'),
    F.length(col('name')).alias('name_length'),
    F.round(exp_of_x1, 2).alias('exp_x1'),
    F.round(log_of_x2, 4).alias('log_x2'),
).show()
+-----------+-----------+------+------+
| name_upper|name_length|exp_x1|log_x2|
+-----------+-----------+------+------+
| EMMA WHITE|         10|181.27|5.3706|
|  ART BROWN|          9| 60.34|6.1591|
|CARLY BLACK|         11| 40.45|5.5607|
| BETH GREEN|         10| 90.02|5.7137|
|   DAN GRAY|          8| 18.17|5.2204|
+-----------+-----------+------+------+
# The same projection written as SQL expression strings.
# selectExpr parses each string as a SQL expression, so no explicit
# expr(...) wrapper is needed around the individual columns.
df.selectExpr(
    'UPPER(name) AS name_upper',
    'LENGTH(name) AS name_length',
    'ROUND(EXP(x1), 2) AS exp_x1',
    'ROUND(LOG(x2), 4) AS log_x2',
).show()
+-----------+-----------+------+------+
| name_upper|name_length|exp_x1|log_x2|
+-----------+-----------+------+------+
| EMMA WHITE|         10|181.27|5.3706|
|  ART BROWN|          9| 60.34|6.1591|
|CARLY BLACK|         11| 40.45|5.5607|
| BETH GREEN|         10| 90.02|5.7137|
|   DAN GRAY|          8| 18.17|5.2204|
+-----------+-----------+------+------+
# Aggregate functions collapse the whole DataFrame into a single summary row.
# All five statistics are computed over the x2 column.
x2 = col('x2')
df.agg(
    F.sum(x2).alias('sum_x2'),
    F.mean(x2).alias('mean_x2'),
    F.stddev(x2).alias('stddev_x2'),  # sample standard deviation
    F.min(x2).alias('min_x2'),
    F.max(x2).alias('max_x2'),
).show()
+------+-------+------------------+------+------+
|sum_x2|mean_x2|         stddev_x2|min_x2|max_x2|
+------+-------+------------------+------+------+
|  1436|  287.2|113.10260828115327|   185|   473|
+------+-------+------------------+------+------+
Lesson 17 - Column Functions