from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
filename = 'a.txt'
# Create an RDD from a text file ...
lines = sc.textFile(filename)
# ... or from an in-memory collection
lines = sc.parallelize(["a", "b"])
filename = "data.json"
people = sc.read.json(filename)
people.registerTempTable("people")
## How to load a directory of JSON files into Apache Spark in Python
import json
# Each line of each file is read as a string, then parsed into a dict
my_RDD_strings = sc.textFile(path_to_dir_with_JSON_files)
my_RDD_dictionaries = my_RDD_strings.map(json.loads)
from pyspark.sql import SQLContext
sql = SQLContext(sc)
filename = "data.json"
people = sql.jsonFile(filename)
people.registerTempTable("people")
# Sum the elements of the RDD (avoid shadowing the built-in sum())
total = rdd.reduce(lambda x, y: x + y)
# Bring every element back to the driver
all_elems = rdd.collect()
num_elems = rdd.count()
# Dict mapping each value to its number of occurrences
counts = rdd.countByValue()
# fold() needs both a zero value and a combining function
a = b.fold(0, lambda x, y: x + y)
# Apply `func` to `rdd`
rdd.foreach(func)
# Take first 10 values
values = rdd.take(10)
# Take top 10 values
values = rdd.top(10)
# Take 10 samples with replacement
values = rdd.takeSample(True, 10)
input = sc.textFile('a.txt')
errors = input.filter(lambda x: 'errors' in x)
warnings = input.filter(lambda x: 'warnings' in x)
badlines = errors.union(warnings)
both_attr = tall.intersection(slim)
only_tall = tall.subtract(slim)
unique = companies.distinct()
squares = data.map(lambda x: x * x)
## flatMap
words = lines.flatMap(lambda line: line.split(" "))
## Numeric operations
count
mean
sum
max
min
variance / sampleVariance
stdev / sampleStdev
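All of these are methods on a numeric RDD; a minimal sketch on a made-up RDD:
nums = sc.parallelize([1.0, 2.0, 3.0, 4.0])
nums.count()           # 4
nums.mean()            # 2.5
nums.sum()             # 10.0
nums.max()             # 4.0
nums.min()             # 1.0
nums.variance()        # population variance
nums.sampleVariance()  # sample variance (divides by n - 1)
nums.stdev()           # population standard deviation
nums.sampleStdev()     # sample standard deviation
nums.stats()           # count, mean, stdev, max, min in a single pass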
## Accumulator
... = sc.accumulator(0)
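A minimal sketch counting blank lines with an accumulator; the file name and variable names are illustrative:
blank_lines = sc.accumulator(0)

def mark_blank(line):
    if line == "":
        blank_lines.add(1)   # tasks may only add; the driver reads .value
    return line

sc.textFile('a.txt').map(mark_blank).count()  # an action forces the work to run
blank_lines.value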
## Broadcast
... = sc.broadcast(...)
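A minimal sketch shipping a read-only lookup table to every executor once; the table contents are made up:
lookup = sc.broadcast({"a": 1, "b": 2})
letters = sc.parallelize(["a", "b", "a"])
letters.map(lambda x: lookup.value.get(x, 0)).collect()  # [1, 2, 1]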
## Actions on single pair RDDs
countByKey
collectAsMap
lookup
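A quick sketch of each action on a small made-up pair RDD:
pairs = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
pairs.countByKey()    # {'a': 2, 'b': 1} -- number of elements per key
pairs.collectAsMap()  # dict with one value kept per key (duplicates collapsed)
pairs.lookup("a")     # [1, 3] -- all values for the key "a"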
## Transformations on two pair RDDs
subtractByKey
join
rightOuterJoin
leftOuterJoin
cogroup
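A quick sketch of each transformation on two made-up pair RDDs:
x = sc.parallelize([("a", 1), ("b", 2)])
y = sc.parallelize([("b", 3), ("c", 4)])
x.subtractByKey(y).collect()   # [('a', 1)] -- keys present only in x
x.join(y).collect()            # [('b', (2, 3))] -- inner join on key
x.leftOuterJoin(y).collect()   # keys from x; missing y values become None
x.rightOuterJoin(y).collect()  # keys from y; missing x values become None
x.cogroup(y)                   # per key, iterables of values from x and y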
## Transformations on single pair RDD
reduceByKey
groupByKey
combineByKey
mapValues
flatMapValues
keys
values
sortByKey
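Quick sketches on a made-up pair RDD; results in the comments may come back in a different order:
pairs = sc.parallelize([("a", 1), ("b", 2), ("a", 3)])
pairs.reduceByKey(lambda x, y: x + y).collect()   # [('a', 4), ('b', 2)]
pairs.groupByKey().mapValues(list).collect()      # values grouped into a list per key
pairs.mapValues(lambda v: v * 10).collect()       # keys unchanged, values transformed
pairs.flatMapValues(lambda v: range(v)).collect() # one output pair per yielded value
pairs.keys().collect()                            # ['a', 'b', 'a']
pairs.values().collect()                          # [1, 2, 3]
pairs.sortByKey().collect()                       # sorted by key, ascending
# combineByKey(createCombiner, mergeValue, mergeCombiners): per-key (sum, count)
pairs.combineByKey(lambda v: (v, 1),
                   lambda acc, v: (acc[0] + v, acc[1] + 1),
                   lambda a, b: (a[0] + b[0], a[1] + b[1])).collect()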
## DataFrame operations
show
select
filter
groupBy
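A minimal sketch reusing the `people` DataFrame loaded from data.json above; the `name` and `age` columns are assumptions about what that file contains:
people.show()                          # print the first rows as a table
people.select("name").show()           # project a column
people.filter(people.age > 21).show()  # keep matching rows
people.groupBy("age").count().show()   # aggregate per group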