Files
2021-08-16 22:20:17 +04:00

24 lines
750 B
Python

#rddaction1.py: rdd action functions
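#The SparkSession import and the "spark = ..." builder statement below can be
#skipped when running directly in the PySpark shell, which already provides a
#"spark" session; "import time" is still needed for the final sleep.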
import time
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) a SparkSession on the local master,
# named "RDD Test app".
spark = (
    SparkSession.builder.master("local[*]")
    .appName("RDD Test app")
    .getOrCreate()
)

# Small sample of integers with repeated values so the counting and
# ranking actions below have something interesting to report.
data = [5, 4, 6, 3, 2, 8, 9, 2, 8, 7,
        8, 4, 4, 8, 2, 7, 8, 9, 6, 9]
rdd1 = spark.sparkContext.parallelize(data)

# glom() groups each partition's elements into a list, so collect()
# shows how the data was split across partitions.
print("RDD contents with partitions:" + str(rdd1.glom().collect()))

# countByValue() returns a mapping of each distinct value to its count.
print("Count by values: " + str(rdd1.countByValue()))

# Fixed: this line previously re-printed glom().collect() instead of
# actually demonstrating reduce(). Addition is used as the reducer.
print("reduce function: " + str(rdd1.reduce(lambda x, y: x + y)))

print("Sum of RDD contents:" + str(rdd1.sum()))

# top(5) returns the five largest elements in descending order.
print("top: " + str(rdd1.top(5)))
print("count: " + str(rdd1.count()))
print("max: " + str(rdd1.max()))

# Fixed: the label lacked the ": " separator used by the other lines,
# so the output read "min2".
print("min: " + str(rdd1.min()))

# Pause for 60 seconds before the script exits.
# NOTE(review): presumably this keeps the application alive long enough
# to inspect the Spark web UI - confirm.
time.sleep(60)