pandas with spark code example
Example 1: python spark dataframe
from pyspark.sql import *
department1 = Row(id='123456', name='Computer Science')
department2 = Row(id='789012', name='Mechanical Engineering')
department3 = Row(id='345678', name='Theater and Drama')
department4 = Row(id='901234', name='Indoor Recreation')
Employee = Row("firstName", "lastName", "email", "salary")
employee1 = Employee('michael', 'armbrust', '[email protected]', 100000)
employee2 = Employee('xiangrui', 'meng', '[email protected]', 120000)
employee3 = Employee('matei', None, '[email protected]', 140000)
employee4 = Employee(None, 'wendell', '[email protected]', 160000)
employee5 = Employee('michael', 'jackson', '[email protected]', 80000)
departmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])
departmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4])
departmentWithEmployees3 = Row(department=department3, employees=[employee5, employee4])
departmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])
print(department1)
print(employee2)
print(departmentWithEmployees1.employees[0].email)
Example 2: spark to pandas
pandas_df = some_df.toPandas()