Files
2021-08-13 11:44:51 +05:30

43 lines
1.3 KiB
Python

#dfcreate1.py: create a df from a collection
#please ignore next 2 statements if running directly in PySpark shell
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, lit, when
spark = SparkSession.builder.master("local[*]")\
.appName("DataFrame Test app")\
.getOrCreate()
data = [('James','','Bylsma','HR','M',40000),
('Kamal','Rahim','','HR','M',41000),
('Robert','','Zaine','Finance','M',35000),
('Sophia','Anne','Richer','Finance','F',47000),
('John','Will','Brown','Engineering','F',65000)
]
columns = ["firstname","middlename","lastname",
"department","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
#show two columns
print(df.select([df.firstname, df.salary]).show())
#replacing values of a columm
myDict = {'F':'Female','M':'Male'}
df2 = df.replace(myDict, subset=['gender'])
#Another way of replacing column values
#df1 = df.withColumn('gender',regexp_replace('gender','M', 'Male'))
#df2 = df1.withColumn('gender',regexp_replace('gender','F', 'Female'))
#adding a new colum Pay Level based on an existing column values
df3 = df2.withColumn("Pay Level",
when((df2.salary < 40000), lit("10")) \
.when((df.salary >= 40000) & (df.salary <= 50000), lit("11")) \
.otherwise(lit("12")) \
)
print(df3.show())
time.sleep(60)