How to access the SparkContext in a PySpark script
Standalone Python script for word count: build a reusable SparkContext using a context manager.
"""SimpleApp.py"""
from contextlib import contextmanager
from pyspark import SparkContext
from pyspark import SparkConf
SPARK_MASTER='local'
SPARK_APP_NAME='Word Count'
SPARK_EXECUTOR_MEMORY='200m'
@contextmanager
def spark_manager():
conf = SparkConf().setMaster(SPARK_MASTER) \
.setAppName(SPARK_APP_NAME) \
.set("spark.executor.memory", SPARK_EXECUTOR_MEMORY)
spark_context = SparkContext(conf=conf)
try:
yield spark_context
finally:
spark_context.stop()
with spark_manager() as context:
File = "/home/ramisetty/sparkex/README.md" # Should be some file on your system
textFileRDD = context.textFile(File)
wordCounts = textFileRDD.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)
wordCounts.saveAsTextFile("output")
print "WordCount - Done"
To launch it:
/bin/spark-submit SimpleApp.py
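Because spark_manager() owns both setup and teardown, the same helper can be reused for further jobs in the same script; each with block gets a fresh SparkContext that is stopped on exit. A minimal sketch (this second job is hypothetical):

# Hypothetical follow-up job reusing the same context manager.
with spark_manager() as context:
    linesRDD = context.textFile("/home/ramisetty/sparkex/README.md")
    # Count only the lines that mention Spark.
    spark_lines = linesRDD.filter(lambda line: "Spark" in line).count()
    print("Lines mentioning Spark: %d" % spark_lines)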
If you already created a SparkSession:

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("StreamKafka_Test") \
    .getOrCreate()
Then you can access the "existing" SparkContext like this:
sc = spark.sparkContext
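Once you have sc, the full RDD API is available alongside the DataFrame API. A minimal sketch (the data is made up):

# Build a small RDD through the SparkContext obtained from the SparkSession.
rdd = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
# Sum the values per key with the classic RDD API.
print(rdd.reduceByKey(lambda a, b: a + b).collect())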
Alternatively, include the following import:

from pyspark.context import SparkContext

and then invoke the getOrCreate() class method on SparkContext:
sc = SparkContext.getOrCreate()
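getOrCreate() returns the already-running SparkContext if one exists and creates a new one otherwise, so it is safe to call from helper code that may run before or after the main script has started Spark. A minimal sketch (the helper function is hypothetical):

# Hypothetical helper that works whether or not a
# SparkContext has already been started elsewhere.
from pyspark.context import SparkContext

def line_count(path):
    sc = SparkContext.getOrCreate()  # reuses the active context if present
    return sc.textFile(path).count()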