Skip to content

Instantly share code, notes, and snippets.

@wolframalpha
Created December 11, 2017 10:18
Show Gist options
  • Select an option

  • Save wolframalpha/f90e5f1487f4ce67f3d1692b91088ec1 to your computer and use it in GitHub Desktop.

Select an option

Save wolframalpha/f90e5f1487f4ce67f3d1692b91088ec1 to your computer and use it in GitHub Desktop.
from pyspark import SparkConf,SparkContext
from pyspark.sql.functions import *
from pyspark.sql import *
from pyspark.sql.types import *
# Spark properties, as (key, value) string pairs, that are loaded into the
# SparkConf used to build the SparkContext for this script.
#
# NOTE(review): several entries (spark.app.id, spark.driver.host,
# spark.driver.port, spark.ui.proxyBase, the AmIpFilter PROXY_* params) look
# like values captured from one specific YARN application -- confirm they are
# meant to be reused verbatim for a fresh context.
configs = [
    ('spark.eventLog.enabled', 'true'),
    ('spark.dynamicAllocation.minExecutors', '8'),
    ('spark.executor.instances', '1000'),
    ('spark.driver.host', '10.142.0.3'),
    ('spark.yarn.am.memory', '640m'),
    ('spark.executor.cores', '4'),
    ('spark.driver.appUIAddress', 'http://10.142.0.3:4040'),
    ('spark.driver.port', '35745'),
    ('spark.executor.extraJavaOptions', ''),
    ('spark.serializer.objectStreamReset', '100'),
    ('spark.submit.deployMode', 'client'),
    (
        'spark.ui.filters',
        'org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter',
    ),
    (
        'spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',
        'http://meow-3214-m:8088/proxy/application_1512711270974_0003',
    ),
    ('spark.driver.maxResultSize', '1020m'),
    ('spark.ui.proxyBase', '/proxy/application_1512711270974_0003'),
    ('spark.shuffle.service.enabled', 'true'),
    ('spark.yarn.jars', 'local:/usr/lib/spark/jars/*'),
    ('spark.scheduler.minRegisteredResourcesRatio', '0.0'),
    ('spark.executor.id', 'driver'),
    ('spark.eventLog.dir', 'hdfs://meow-3214-m/user/spark/eventlog'),
    ('spark.yarn.historyServer.address', 'meow-3214-m:18080'),
    ('spark.executor.memory', '2688m'),
    ('spark.app.name', 'pyspark-shell'),
    ('spark.dynamicAllocation.maxExecutors', '4000'),
    (
        'spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',
        'meow-3214-m',
    ),
    ('spark.app.id', 'application_1512711270974_0003'),
    (
        'spark.executorEnv.PYTHONPATH',
        '/usr/lib/spark/python/:/usr/lib/spark/python/lib/py4j-0.10.4-src.zip<CPS>{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.4-src.zip',
    ),
    ('spark.master', 'yarn'),
    ('spark.sql.catalogImplementation', 'hive'),
    ('spark.executorEnv.PYTHONHASHSEED', '0'),
    ('spark.rpc.message.maxSize', '512'),
    ('spark.rdd.compress', 'True'),
    ('spark.driver.memory', '1040m'),
    ('spark.yarn.isPython', 'true'),
    ('spark.sql.parquet.cacheMetadata', 'false'),
    ('spark.dynamicAllocation.enabled', 'true'),
    ('spark.history.fs.logDirectory', 'hdfs://meow-3214-m/user/spark/eventlog'),
    ('spark.driver.extraJavaOptions', ''),
]
# Build the Spark configuration from the (key, value) pairs in ``configs``.
conf = SparkConf()
conf.setAll(configs)

# Spark allows only one active SparkContext per driver process, so a context
# that already exists (the PySpark shell and notebook kernels pre-create one
# as ``sc``) has to be stopped before a new one can be built from ``conf``.
# When the script runs somewhere that did not pre-create ``sc`` there is
# nothing to stop, and the bare ``sc.stop()`` would abort with NameError.
try:
    sc.stop()
except NameError:
    pass

# Start a fresh context with the settings above and wrap it in a SparkSession
# so the DataFrame reader API is available as ``spark``.
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
# Column names, in positional order, assigned to the DataFrame loaded from the
# CSV data further down. The order must match the column order of the files.
headers = [
    # Order / product / outlet identifiers.
    u'is_online_order',
    u'pos_disc_code',
    u'brand',
    u'style_code',
    u'oid_retail_product',
    u'oid_retail_outlet',
    # Category codes, levels 1-8.
    u'category_code_lvl_1',
    u'category_code_lvl_2',
    u'category_code_lvl_3',
    u'category_code_lvl_4',
    u'category_code_lvl_5',
    u'category_code_lvl_6',
    u'category_code_lvl_7',
    u'category_code_lvl_8',
    # Category descriptions, levels 1-8.
    u'category_desc_lvl_1',
    u'category_desc_lvl_2',
    u'category_desc_lvl_3',
    u'category_desc_lvl_4',
    u'category_desc_lvl_5',
    u'category_desc_lvl_6',
    u'category_desc_lvl_7',
    u'category_desc_lvl_8',
    # Calendar columns.
    u'greg_year',
    u'greg_week_num',
    # Totals and averages.
    u'tot_exchange_amt',
    u'tot_exchange_cnt',
    u'tot_extended_cost_of_goods',
    u'tot_reg_price',
    u'tot_profit',
    u'tot_revenue',
    u'avg_reg_price',
    u'tot_item_qty',
    u'tot_return_amt',
    u'tot_return_cnt',
    u'tot_discount_amt',
    u'tot_coupon_amt',
    u'avg_otd_unit',
    u'avg_cost_unit',
    u'avg_profit_unit',
    # Counters.
    u'count_unadvertised_price_d',
    u'count_advertised_promo',
    u'count_clearance_price',
    u'count_regular_sale',
    u'count_return_item',
    u'count_no_discount',
    u'count_shopko_store_coupon',
    u'count_no_coupon',
]
# Read every CSV file under the bucket path, letting Spark infer column types.
# No ``header`` option is passed, so the columns come back with positional
# default names and are renamed here to the names listed in ``headers``.
#
# DataFrame.toDF(*names) renames the columns in place of the previous
# ``.rdd.toDF(headers)``, which shipped every row through Python and
# re-inferred the schema from Python values, discarding the types that
# ``inferSchema=True`` had just computed.
# NOTE(review): column types now come straight from the CSV reader's
# inference and may differ from the old path (e.g. integer width) -- confirm
# nothing downstream depends on the old types.
df = spark.read.csv("gs://shopko-data/data/", inferSchema=True).toDF(*headers)

# Redistribute the rows over 36 * 3 partitions.
# NOTE(review): presumably three partitions per each of 36 cores -- confirm.
df = df.repartition(36 * 3)

# Mark the DataFrame for caching so later actions reuse the loaded data.
df = df.cache()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment