Starting a Jupyter notebook for PySpark!
Initial Configuration - Python libraries: pandas, numpy, and pyspark
# Spark!
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession  # SparkSession lives in pyspark.sql
from pyspark.sql.functions import *   # column helpers: col, lit, when, ...
spark = SparkSession.builder.appName("myapp").getOrCreate()
# Alternative: explicit YARN deployment with tuned executor resources
# spark = SparkSession.builder.master("yarn")\
# .config("spark.executor.instances", "32")\
# .config("spark.executor.cores", "4")\
# .config("spark.executor.memory", "4G")\
# .config("spark.driver.memory", "4G")\
# .config("spark.executor.memoryOverhead","4G")\
# .config("spark.yarn.queue","Medium")\
# .appName("myapp")\
# .getOrCreate()
sc = spark.sparkContext
# Treat partition columns as strings rather than inferring their types
spark.conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false")
# maxToStringFields takes a field count (an int), not a boolean
spark.conf.set("spark.debug.maxToStringFields", "100")
Initial Configuration + DataFrameBuild - My paths and libraries
# Project-internal modules: application context and DataFrame builder
from shared.app_context import *
from builder.DataFrameBuild import *
ctx = ApplicationContext("Dev-Job")  # app-level context that carries a SparkSession
print(ctx.spark)                     # confirm the session the context exposes
DFB = DataFrameBuild(ctx.spark)      # DataFrame-building helper bound to that session
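DataFrameBuild is an in-house helper, so its methods aren't shown here. As a minimal sketch of what such a builder typically wraps, plain PySpark can promote a pandas frame to a Spark DataFrame on the same session (the data and column names below are made up, and DFB's real API is project-specific):
# Sketch only: DFB's actual methods are project-specific
pdf = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})
sdf = ctx.spark.createDataFrame(pdf)  # pandas -> Spark DataFrame
sdf.printSchema()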