Last active
October 24, 2016 23:25
-
-
Save adgaudio/0191e14717af68bbba81 to your computer and use it in GitHub Desktop.
This gist demonstrates that Spark 0.9.1 (and, I'm guessing, also 1.0.0) doesn't serialize a logger instance properly when code runs on workers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This gist demonstrates that spark 1.0.0 and 0.9.1 | |
don't serialize a logger instance properly when code runs on workers. | |
run this code via: | |
spark-submit spark_serialization_demo.py | |
- or - | |
pyspark spark_serialization_demo.py | |
""" | |
import pyspark | |
from os.path import abspath | |
import logging | |
# Initialize the logger at import time, so every process that imports this
# module ends up with a configured 'alexTest' logger.
log = logging.getLogger('alexTest')
_h = logging.StreamHandler()
# Use %(message)s (the message with its args merged in) rather than %(msg)s
# (the raw format string), so calls such as log.info("x=%s", x) render fully.
_h.setFormatter(logging.Formatter("%(levelname)s %(message)s"))
# Attach the handler only once: passes() imports this file a second time under
# the name 'spark_serialization_demo', and logging.getLogger returns the same
# logger object for the same name, so an unconditional addHandler would make
# every line print twice in that process.
if not log.handlers:
    log.addHandler(_h)
log.setLevel(logging.DEBUG)
log.info("module imported and logger initialized")

# Label naming the code path being exercised; myfunc interpolates it into its
# debug line. The __main__ section rebinds it before calling fails().
FUNC = 'passes()'
def myfunc(*ignore_args):
    """Emit one debug line naming the current ``FUNC`` label and return 0.

    Any positional arguments (for example the element handed over by
    ``map``) are accepted and ignored.
    """
    line = 'logging a line from: %s' % FUNC
    log.debug(line)
    return 0
def passes():
    """Map ``myfunc`` over ``textFile`` via the imported module and print 5 results.

    ``myfunc`` is looked up on the module object returned by importing
    'spark_serialization_demo' (so this file must be saved under that name),
    not taken from the running script's own namespace. Per this gist, that is
    the variant that works, because the logger is set up when the module is
    imported.

    Relies on the module-level ``textFile`` created in the ``__main__`` section.
    """
    demo_module = __import__('spark_serialization_demo')
    mapped = textFile.map(demo_module.myfunc, preservesPartitioning=True)
    print(mapped.take(5))
def fails():
    """Map the script-local ``myfunc`` over ``textFile`` and print 5 results.

    Per this gist, the map/take step is expected to raise a serialization
    error, so the explicit ``raise`` at the end only fires if that expected
    failure did NOT happen.

    Relies on the module-level ``textFile`` created in the ``__main__`` section.
    """
    mapped = textFile.map(myfunc, preservesPartitioning=True)
    first_rows = mapped.take(5)
    print(first_rows)
    raise Exception("Never reach this point because code fails first due to serialization error")
if __name__ == '__main__':
    # Driver script: run the failing variant first (catching its error), then
    # the passing variant.
    sc = pyspark.SparkContext("local[10]", 'test')
    try:
        # Use this script's own source file as the input data set.
        # `textFile` is a module-level name read by passes() and fails().
        textFile = sc.textFile("file://%s" % abspath(__file__), 5)

        print('\n\n---')
        FUNC = 'fails()'
        # Adjacent string literals are concatenated with no separator, so the
        # space between "that" and "does" must be written out explicitly.
        log.info(
            "This example fails because it serializes a function that"
            " does not initialize the logger when the function is unserialized")
        try:
            fails()
        except Exception as err:
            # Broad catch is deliberate: the demo reports whatever error the
            # failing variant produced and then carries on.
            log.error("See, I failed!  Details: %s" % err)

        print('\n---')
        log.info(
            "This example passes because it serializes a module that initializes"
            " the logger when the module is unserialized")
        passes()
    finally:
        # Always shut the context down, even if passes() raises.
        sc.stop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Did some digging :
https://medium.com/@anicolaspp/how-to-log-in-apache-spark-f4204fad78a#.cq6k7d7x8
It would be great if someone could convert the code to Python.