Created
November 30, 2018 10:25
-
-
Save korkridake/f057651fce2bd4a8778f8e9781459368 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Show the two handles the environment provides: `sc` first, then the `spark` session,
# one per line.
print(sc, spark, sep="\n")
# <pyspark.sql.session.SparkSession at 0x7f8df8673ba8> | |
# ------------------------------------------------------------------------------- | |
# Import PySpark Libraries | |
# ------------------------------------------------------------------------------- | |
import datetime | |
from datetime import datetime | |
from pyspark.sql.functions import skewness, kurtosis | |
from pyspark.sql.functions import var_pop, var_samp, stddev, stddev_pop, sumDistinct, ntile | |
#'udf' stands for 'user defined function', and is simply a wrapper for functions you write and | |
#want to apply to a column that knows how to iterate through pySpark dataframe columns. it should | |
#be more clear after we use it below | |
from pyspark.sql.functions import udf | |
from pyspark.sql.functions import col | |
from pyspark.sql.types import IntegerType | |
from pyspark.sql.types import StringType | |
from pyspark.sql.types import DateType | |
from pyspark.sql import DataFrame | |
from pyspark.sql import Row | |
from functools import reduce | |
# <pyspark.sql.session.SparkSession object at 0x7f7e362177f0> | |
# Spark uses a cluster manager (e.g., Spark's own standalone manager, YARN or Mesos), and a number of worker nodes. | |
# The manager attempts to acquire executors on the worker nodes, which do computations and store data based on the code and tasks that are sent to them. | |
# Spark's primary abstraction is a so-called Resilient Distributed Dataset (RDD). Spark can create RDDs from any storage source supported by Hadoop. An RDD holds intermediate computational results and is stored in RAM or on disk across the worker nodes. In case a node fails, an RDD can be restored. Many processes can be executed in parallel thanks to the distributed nature of RDDs, and pipelining and lazy execution prevent the need for saving intermediate results for the next step. Importantly, Spark supports pulling data sets into a cluster-wide in-memory cache for fast access.
# RDD operations can be divided into 2 groups: transformations and actions. Transformations (e.g., map) of RDDs always result in new RDDs, and actions (e.g., reduce) return values that are the result of operations on the RDD back to the driver program. | |
# Spread the integers 0..13 over the cluster as an RDD made of 4 partitions.
rdd = sc.parallelize(range(0, 14), 4)
num_partitions = rdd.getNumPartitions()
print("Number of partitions: {}".format(num_partitions))
# 'glom' turns every partition into a list, so collecting shows which
# elements ended up in which partition.
elements_per_partition = rdd.glom().collect()
print(elements_per_partition)
# Number of partitions: 4 | |
# [[0, 1, 2], [3, 4, 5, 6], [7, 8, 9], [10, 11, 12, 13]] | |
# ------------------------------------------------------------------------------- | |
# ------------------------------------------------------------------------------- | |
# RDDs are distributed data sets !!! | |
# ------------------------------------------------------------------------------- | |
# ------------------------------------------------------------------------------- | |
# Same idea on a larger scale: the integers 0..99 spread over 20 partitions.
rdd = sc.parallelize(range(0, 100), 20)
num_partitions = rdd.getNumPartitions()
print("Number of partitions: {}".format(num_partitions))
# 'glom' turns every partition into a list, so collecting shows which
# elements ended up in which partition.
elements_per_partition = rdd.glom().collect()
print(elements_per_partition)
# Number of partitions: 20 | |
# # [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], [20, 21, 22, 23, 24], [25, 26, 27, 28, 29], [30, 31, 32, 33, 34], [35, 36, 37, 38, 39], [40, 41, 42, 43, 44], [45, 46, 47, 48, 49], [50, 51, 52, 53, 54], [55, 56, 57, 58, 59], [60, 61, 62, 63, 64], [65, 66, 67, 68, 69], [70, 71, 72, 73, 74], [75, 76, 77, 78, 79], [80, 81, 82, 83, 84], [85, 86, 87, 88, 89], [90, 91, 92, 93, 94], [95, 96, 97, 98, 99]] | |
# 'map' is a transformation: it describes a new RDD in which every element is squared.
rddSquared = rdd.map(lambda value: value ** 2)
# The same transformation written with a named function instead of a lambda:
# def squared(x):
#     return x ** 2
# rddSquared = rdd.map(squared)
# The 'collect' action is what triggers Spark: the map above is executed and the
# results, grouped per partition by 'glom', are brought back and printed.
squares_per_partition = rddSquared.glom().collect()
print(squares_per_partition)
# [[0, 1, 4, 9, 16], [25, 36, 49, 64, 81], [100, 121, 144, 169, 196], [225, 256, 289, 324, 361], [400, 441, 484, 529, 576], [625, 676, 729, 784, 841], [900, 961, 1024, 1089, 1156], [1225, 1296, 1369, 1444, 1521], [1600, 1681, 1764, 1849, 1936], [2025, 2116, 2209, 2304, 2401], [2500, 2601, 2704, 2809, 2916], [3025, 3136, 3249, 3364, 3481], [3600, 3721, 3844, 3969, 4096], [4225, 4356, 4489, 4624, 4761], [4900, 5041, 5184, 5329, 5476], [5625, 5776, 5929, 6084, 6241], [6400, 6561, 6724, 6889, 7056], [7225, 7396, 7569, 7744, 7921], [8100, 8281, 8464, 8649, 8836], [9025, 9216, 9409, 9604, 9801]] | |
# For comparison: without 'glom' the same elements come back as one flat list.
all_squares = rddSquared.collect()
print(all_squares)
# [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916, 3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969, 4096, 4225, 4356, 4489, 4624, 4761, 4900, 5041, 5184, 5329, 5476, 5625, 5776, 5929, 6084, 6241, 6400, 6561, 6724, 6889, 7056, 7225, 7396, 7569, 7744, 7921, 8100, 8281, 8464, 8649, 8836, 9025, 9216, 9409, 9604, 9801] | |
# -----------------------
# Popular transformations
# -----------------------
# Transformations are lazy: each call below only describes a new RDD, and
# nothing is computed until an action (e.g. collect) is invoked on the result.
# `func` is the generic element-wise example used by map and sortBy.
func = lambda x: -x
rdd.map(func)  # apply func to every element
# flatMap expects the function to return an iterable for each element and
# flattens those iterables into a single RDD. A scalar-returning function
# such as `func` would raise a TypeError as soon as an action runs, so each
# element is expanded into a (value, negated value) pair here.
rdd.flatMap(lambda x: (x, func(x)))
# filter keeps the elements for which the predicate is truthy. With `func`
# as the predicate it would merely drop 0, so use an explicit boolean test.
rdd.filter(lambda x: x % 2 == 0)
# sortBy orders the elements by the key func computes; negating the value
# yields descending order.
rdd.sortBy(func)
# -----------------------
# Popular actions
# -----------------------
# Fold all elements into a single value (here: their sum).
rdd.reduce(lambda left, right: left + right)
# Number of elements.
rdd.count()
# Actions that pull data out of an RDD:
# all elements
print(rdd.collect())
# the first element
print(rdd.first())
# the first N elements
print(rdd.take(5))
# the N highest elements, in descending order
print(rdd.top(3))
# the first N elements in ascending order, or in the order given by a key
# function; negating the value returns the 7 largest elements
print(rdd.takeOrdered(7, key=lambda item: -item))
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] | |
# 0 | |
# [0, 1, 2, 3, 4] | |
# [99, 98, 97] | |
# [99, 98, 97, 96, 95, 94, 93] | |
type(rdd)
# Out[13]: pyspark.rdd.PipelinedRDD
# An RDD whose elements are (key, value) tuples can be used as a PairRDD.
# Here every element of rdd is keyed by whether it is even (True) or odd (False).
rddP1 = rdd.map(lambda n: (n % 2 == 0, n))
# keyBy says the same thing more directly: it pairs each element with the key
# computed for it.
rddP1 = rdd.keyBy(lambda n: n % 2 == 0)
# Zipping two RDDs is another way to obtain a PairRDD; it assumes both RDDs
# have the same length.
zipped_pairs = rdd.zip(rdd)
print("Zipped: {}".format(zipped_pairs.collect()))
# Zipped: [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9), (10, 10), (11, 11), (12, 12), (13, 13), (14, 14), (15, 15), (16, 16), (17, 17), (18, 18), (19, 19), (20, 20), (21, 21), (22, 22), (23, 23), (24, 24), (25, 25), (26, 26), (27, 27), (28, 28), (29, 29), (30, 30), (31, 31), (32, 32), (33, 33), (34, 34), (35, 35), (36, 36), (37, 37), (38, 38), (39, 39), (40, 40), (41, 41), (42, 42), (43, 43), (44, 44), (45, 45), (46, 46), (47, 47), (48, 48), (49, 49), (50, 50), (51, 51), (52, 52), (53, 53), (54, 54), (55, 55), (56, 56), (57, 57), (58, 58), (59, 59), (60, 60), (61, 61), (62, 62), (63, 63), (64, 64), (65, 65), (66, 66), (67, 67), (68, 68), (69, 69), (70, 70), (71, 71), (72, 72), (73, 73), (74, 74), (75, 75), (76, 76), (77, 77), (78, 78), (79, 79), (80, 80), (81, 81), (82, 82), (83, 83), (84, 84), (85, 85), (86, 86), (87, 87), (88, 88), (89, 89), (90, 90), (91, 91), (92, 92), (93, 93), (94, 94), (95, 95), (96, 96), (97, 97), (98, 98), (99, 99)] | |
# A PairRDD exposes its keys and its values as separate RDDs.
pair_keys = rddP1.keys().collect()
print("Keys: {}".format(pair_keys))
pair_values = rddP1.values().collect()
print("Values: {}".format(pair_values))
# Keys: [True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False, True, False] | |
# Values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99] | |
# Mapping over a PairRDD hands each (key, value) tuple to the function:
# pair[0] is the key and pair[1] the value.
squared_pairs = rddP1.map(lambda pair: (pair[0], pair[1] ** 2))
print(squared_pairs.collect())
# [(True, 0), (False, 1), (True, 4), (False, 9), (True, 16), (False, 25), (True, 36), (False, 49), (True, 64), (False, 81), (True, 100), (False, 121), (True, 144), (False, 169), (True, 196), (False, 225), (True, 256), (False, 289), (True, 324), (False, 361), (True, 400), (False, 441), (True, 484), (False, 529), (True, 576), (False, 625), (True, 676), (False, 729), (True, 784), (False, 841), (True, 900), (False, 961), (True, 1024), (False, 1089), (True, 1156), (False, 1225), (True, 1296), (False, 1369), (True, 1444), (False, 1521), (True, 1600), (False, 1681), (True, 1764), (False, 1849), (True, 1936), (False, 2025), (True, 2116), (False, 2209), (True, 2304), (False, 2401), (True, 2500), (False, 2601), (True, 2704), (False, 2809), (True, 2916), (False, 3025), (True, 3136), (False, 3249), (True, 3364), (False, 3481), (True, 3600), (False, 3721), (True, 3844), (False, 3969), (True, 4096), (False, 4225), (True, 4356), (False, 4489), (True, 4624), (False, 4761), (True, 4900), (False, 5041), (True, 5184), (False, 5329), (True, 5476), (False, 5625), (True, 5776), (False, 5929), (True, 6084), (False, 6241), (True, 6400), (False, 6561), (True, 6724), (False, 6889), (True, 7056), (False, 7225), (True, 7396), (False, 7569), (True, 7744), (False, 7921), (True, 8100), (False, 8281), (True, 8464), (False, 8649), (True, 8836), (False, 9025), (True, 9216), (False, 9409), (True, 9604), (False, 9801)] | |
# Preferred: mapValues (and flatMapValues) only touch the values and leave the keys in place.
squared_values = rddP1.mapValues(lambda value: value ** 2)
print(squared_values.collect())
# [(True, 0), (False, 1), (True, 4), (False, 9), (True, 16), (False, 25), (True, 36), (False, 49), (True, 64), (False, 81), (True, 100), (False, 121), (True, 144), (False, 169), (True, 196), (False, 225), (True, 256), (False, 289), (True, 324), (False, 361), (True, 400), (False, 441), (True, 484), (False, 529), (True, 576), (False, 625), (True, 676), (False, 729), (True, 784), (False, 841), (True, 900), (False, 961), (True, 1024), (False, 1089), (True, 1156), (False, 1225), (True, 1296), (False, 1369), (True, 1444), (False, 1521), (True, 1600), (False, 1681), (True, 1764), (False, 1849), (True, 1936), (False, 2025), (True, 2116), (False, 2209), (True, 2304), (False, 2401), (True, 2500), (False, 2601), (True, 2704), (False, 2809), (True, 2916), (False, 3025), (True, 3136), (False, 3249), (True, 3364), (False, 3481), (True, 3600), (False, 3721), (True, 3844), (False, 3969), (True, 4096), (False, 4225), (True, 4356), (False, 4489), (True, 4624), (False, 4761), (True, 4900), (False, 5041), (True, 5184), (False, 5329), (True, 5476), (False, 5625), (True, 5776), (False, 5929), (True, 6084), (False, 6241), (True, 6400), (False, 6561), (True, 6724), (False, 6889), (True, 7056), (False, 7225), (True, 7396), (False, 7569), (True, 7744), (False, 7921), (True, 8100), (False, 8281), (True, 8464), (False, 8649), (True, 8836), (False, 9025), (True, 9216), (False, 9409), (True, 9604), (False, 9801)] | |
# Dropping the key in a plain map turns the PairRDD back into a normal RDD.
squares_without_keys = rddP1.map(lambda pair: pair[1] ** 2)
print(squares_without_keys.collect())
# [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400, 441, 484, 529, 576, 625, 676, 729, 784, 841, 900, 961, 1024, 1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849, 1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916, 3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969, 4096, 4225, 4356, 4489, 4624, 4761, 4900, 5041, 5184, 5329, 5476, 5625, 5776, 5929, 6084, 6241, 6400, 6561, 6724, 6889, 7056, 7225, 7396, 7569, 7744, 7921, 8100, 8281, 8464, 8649, 8836, 9025, 9216, 9409, 9604, 9801] | |
# Aggregations by key include reduceByKey, combineByKey and foldByKey.
# reduceByKey merges all values that share a key using the given function:
sums_by_key = rddP1.reduceByKey(lambda total, value: total + value)
print("Sum per key: {}".format(sums_by_key.collect()))
# Several common operations are also available in a 'ByKey' form, e.g.:
rddP1.sortByKey()
rddP1.countByKey()
# Sum per key: [(False, 2500), (True, 2450)] | |
# Out[20]: defaultdict(<class 'int'>, {False: 50, True: 50}) | |
# Grouping and joining by key
# A second PairRDD to combine with rddP1: the even numbers 0..26, which all
# end up under the key True.
rddP2 = sc.parallelize(range(0, 28, 2)).keyBy(lambda n: n % 2 == 0)
# Inner join: one output pair per combination of values that share a key, so
# overlapping keys behave like a cross join of their values.
joined = rddP1.join(rddP2)
print("Join: {}".format(joined.collect()))
# Left and right outer joins
rddP1.leftOuterJoin(rddP2)
rddP1.rightOuterJoin(rddP2)
# cogroup: for every key found in either rddP1 or rddP2, returns iterables
# of the values from each side.
cogrouped = rddP1.cogroup(rddP2)
print("Cogroup: {}".format(cogrouped.collect()))
# groupWith cogroups more than two RDDs by key.
rddP1.groupWith(rddP2, rddP2)
# groupByKey builds a new RDD that keeps equal keys on the same node where
# possible; 'glom' shows the resulting layout per partition.
grouped = rddP1.groupByKey()
print("After groupByKey: {}".format(grouped.glom().collect()))
# Join: [(True, (6, 10)), (True, (6, 12)), (True, (6, 22)), (True, (6, 24)), (True, (6, 26)), (True, (6, 0)), (True, (6, 2)), (True, (6, 4)), (True, (6, 6)), (True, (6, 8)), (True, (6, 14)), (True, (6, 16)), (True, (6, 18)), (True, (6, 20)), (True, (8, 10)), (True, (8, 12)), (True, (8, 22)), (True, (8, 24)), (True, (8, 26)), (True, (8, 0)), (True, (8, 2)), (True, (8, 4)), (True, (8, 6)), (True, (8, 8)), (True, (8, 14)), (True, (8, 16)), (True, (8, 18)), (True, (8, 20)), (True, (16, 10)), (True, (16, 12)), (True, (16, 22)), (True, (16, 24)), (True, (16, 26)), (True, (16, 0)), (True, (16, 2)), (True, (16, 4)), (True, (16, 6)), (True, (16, 8)), (True, (16, 14)), (True, (16, 16)), (True, (16, 18)), (True, (16, 20)), (True, (18, 10)), (True, (18, 12)), (True, (18, 22)), (True, (18, 24)), (True, (18, 26)), (True, (18, 0)), (True, (18, 2)), (True, (18, 4)), (True, (18, 6)), (True, (18, 8)), (True, (18, 14)), (True, (18, 16)), (True, (18, 18)), (True, (18, 20)), (True, (26, 10)), (True, (26, 12)), (True, (26, 22)), (True, (26, 24)), (True, (26, 26)), (True, (26, 0)), (True, (26, 2)), (True, (26, 4)), (True, (26, 6)), (True, (26, 8)), (True, (26, 14)), (True, (26, 16)), (True, (26, 18)), (True, (26, 20)), (True, (28, 10)), (True, (28, 12)), (True, (28, 22)), (True, (28, 24)), (True, (28, 26)), (True, (28, 0)), (True, (28, 2)), (True, (28, 4)), (True, (28, 6)), (True, (28, 8)), (True, (28, 14)), (True, (28, 16)), (True, (28, 18)), (True, (28, 20)), (True, (36, 10)), (True, (36, 12)), (True, (36, 22)), (True, (36, 24)), (True, (36, 26)), (True, (36, 0)), (True, (36, 2)), (True, (36, 4)), (True, (36, 6)), (True, (36, 8)), (True, (36, 14)), (True, (36, 16)), (True, (36, 18)), (True, (36, 20)), (True, (38, 10)), (True, (38, 12)), (True, (38, 22)), (True, (38, 24)), (True, (38, 26)), (True, (38, 0)), (True, (38, 2)), (True, (38, 4)), (True, (38, 6)), (True, (38, 8)), (True, (38, 14)), (True, (38, 16)), (True, (38, 18)), (True, (38, 20)), (True, (46, 10)), (True, (46, 12)), (True, 
(46, 22)), (True, (46, 24)), (True, (46, 26)), (True, (46, 0)), (True, (46, 2)), (True, (46, 4)), (True, (46, 6)), (True, (46, 8)), (True, (46, 14)), (True, (46, 16)), (True, (46, 18)), (True, (46, 20)), (True, (48, 10)), (True, (48, 12)), (True, (48, 22)), (True, (48, 24)), (True, (48, 26)), (True, (48, 0)), (True, (48, 2)), (True, (48, 4)), (True, (48, 6)), (True, (48, 8)), (True, (48, 14)), (True, (48, 16)), (True, (48, 18)), (True, (48, 20)), (True, (56, 10)), (True, (56, 12)), (True, (56, 22)), (True, (56, 24)), (True, (56, 26)), (True, (56, 0)), (True, (56, 2)), (True, (56, 4)), (True, (56, 6)), (True, (56, 8)), (True, (56, 14)), (True, (56, 16)), (True, (56, 18)), (True, (56, 20)), (True, (58, 10)), (True, (58, 12)), (True, (58, 22)), (True, (58, 24)), (True, (58, 26)), (True, (58, 0)), (True, (58, 2)), (True, (58, 4)), (True, (58, 6)), (True, (58, 8)), (True, (58, 14)), (True, (58, 16)), (True, (58, 18)), (True, (58, 20)), (True, (66, 10)), (True, (66, 12)), (True, (66, 22)), (True, (66, 24)), (True, (66, 26)), (True, (66, 0)), (True, (66, 2)), (True, (66, 4)), (True, (66, 6)), (True, (66, 8)), (True, (66, 14)), (True, (66, 16)), (True, (66, 18)), (True, (66, 20)), (True, (68, 10)), (True, (68, 12)), (True, (68, 22)), (True, (68, 24)), (True, (68, 26)), (True, (68, 0)), (True, (68, 2)), (True, (68, 4)), (True, (68, 6)), (True, (68, 8)), (True, (68, 14)), (True, (68, 16)), (True, (68, 18)), (True, (68, 20)), (True, (76, 10)), (True, (76, 12)), (True, (76, 22)), (True, (76, 24)), (True, (76, 26)), (True, (76, 0)), (True, (76, 2)), (True, (76, 4)), (True, (76, 6)), (True, (76, 8)), (True, (76, 14)), (True, (76, 16)), (True, (76, 18)), (True, (76, 20)), (True, (78, 10)), (True, (78, 12)), (True, (78, 22)), (True, (78, 24)), (True, (78, 26)), (True, (78, 0)), (True, (78, 2)), (True, (78, 4)), (True, (78, 6)), (True, (78, 8)), (True, (78, 14)), (True, (78, 16)), (True, (78, 18)), (True, (78, 20)), (True, (80, 10)), (True, (80, 12)), (True, (80, 22)), (True, (80, 
24)), (True, (80, 26)), (True, (80, 0)), (True, (80, 2)), (True, (80, 4)), (True, (80, 6)), (True, (80, 8)), (True, (80, 14)), (True, (80, 16)), (True, (80, 18)), (True, (80, 20)), (True, (82, 10)), (True, (82, 12)), (True, (82, 22)), (True, (82, 24)), (True, (82, 26)), (True, (82, 0)), (True, (82, 2)), (True, (82, 4)), (True, (82, 6)), (True, (82, 8)), (True, (82, 14)), (True, (82, 16)), (True, (82, 18)), (True, (82, 20)), (True, (84, 10)), (True, (84, 12)), (True, (84, 22)), (True, (84, 24)), (True, (84, 26)), (True, (84, 0)), (True, (84, 2)), (True, (84, 4)), (True, (84, 6)), (True, (84, 8)), (True, (84, 14)), (True, (84, 16)), (True, (84, 18)), (True, (84, 20)), (True, (86, 10)), (True, (86, 12)), (True, (86, 22)), (True, (86, 24)), (True, (86, 26)), (True, (86, 0)), (True, (86, 2)), (True, (86, 4)), (True, (86, 6)), (True, (86, 8)), (True, (86, 14)), (True, (86, 16)), (True, (86, 18)), (True, (86, 20)), (True, (88, 10)), (True, (88, 12)), (True, (88, 22)), (True, (88, 24)), (True, (88, 26)), (True, (88, 0)), (True, (88, 2)), (True, (88, 4)), (True, (88, 6)), (True, (88, 8)), (True, (88, 14)), (True, (88, 16)), (True, (88, 18)), (True, (88, 20)), (True, (90, 10)), (True, (90, 12)), (True, (90, 22)), (True, (90, 24)), (True, (90, 26)), (True, (90, 0)), (True, (90, 2)), (True, (90, 4)), (True, (90, 6)), (True, (90, 8)), (True, (90, 14)), (True, (90, 16)), (True, (90, 18)), (True, (90, 20)), (True, (92, 10)), (True, (92, 12)), (True, (92, 22)), (True, (92, 24)), (True, (92, 26)), (True, (92, 0)), (True, (92, 2)), (True, (92, 4)), (True, (92, 6)), (True, (92, 8)), (True, (92, 14)), (True, (92, 16)), (True, (92, 18)), (True, (92, 20)), (True, (94, 10)), (True, (94, 12)), (True, (94, 22)), (True, (94, 24)), (True, (94, 26)), (True, (94, 0)), (True, (94, 2)), (True, (94, 4)), (True, (94, 6)), (True, (94, 8)), (True, (94, 14)), (True, (94, 16)), (True, (94, 18)), (True, (94, 20)), (True, (96, 10)), (True, (96, 12)), (True, (96, 22)), (True, (96, 24)), (True, (96, 26)), 
(True, (96, 0)), (True, (96, 2)), (True, (96, 4)), (True, (96, 6)), (True, (96, 8)), (True, (96, 14)), (True, (96, 16)), (True, (96, 18)), (True, (96, 20)), (True, (98, 10)), (True, (98, 12)), (True, (98, 22)), (True, (98, 24)), (True, (98, 26)), (True, (98, 0)), (True, (98, 2)), (True, (98, 4)), (True, (98, 6)), (True, (98, 8)), (True, (98, 14)), (True, (98, 16)), (True, (98, 18)), (True, (98, 20)), (True, (0, 10)), (True, (0, 12)), (True, (0, 22)), (True, (0, 24)), (True, (0, 26)), (True, (0, 0)), (True, (0, 2)), (True, (0, 4)), (True, (0, 6)), (True, (0, 8)), (True, (0, 14)), (True, (0, 16)), (True, (0, 18)), (True, (0, 20)), (True, (2, 10)), (True, (2, 12)), (True, (2, 22)), (True, (2, 24)), (True, (2, 26)), (True, (2, 0)), (True, (2, 2)), (True, (2, 4)), (True, (2, 6)), (True, (2, 8)), (True, (2, 14)), (True, (2, 16)), (True, (2, 18)), (True, (2, 20)), (True, (4, 10)), (True, (4, 12)), (True, (4, 22)), (True, (4, 24)), (True, (4, 26)), (True, (4, 0)), (True, (4, 2)), (True, (4, 4)), (True, (4, 6)), (True, (4, 8)), (True, (4, 14)), (True, (4, 16)), (True, (4, 18)), (True, (4, 20)), (True, (10, 10)), (True, (10, 12)), (True, (10, 22)), (True, (10, 24)), (True, (10, 26)), (True, (10, 0)), (True, (10, 2)), (True, (10, 4)), (True, (10, 6)), (True, (10, 8)), (True, (10, 14)), (True, (10, 16)), (True, (10, 18)), (True, (10, 20)), (True, (12, 10)), (True, (12, 12)), (True, (12, 22)), (True, (12, 24)), (True, (12, 26)), (True, (12, 0)), (True, (12, 2)), (True, (12, 4)), (True, (12, 6)), (True, (12, 8)), (True, (12, 14)), (True, (12, 16)), (True, (12, 18)), (True, (12, 20)), (True, (14, 10)), (True, (14, 12)), (True, (14, 22)), (True, (14, 24)), (True, (14, 26)), (True, (14, 0)), (True, (14, 2)), (True, (14, 4)), (True, (14, 6)), (True, (14, 8)), (True, (14, 14)), (True, (14, 16)), (True, (14, 18)), (True, (14, 20)), (True, (20, 10)), (True, (20, 12)), (True, (20, 22)), (True, (20, 24)), (True, (20, 26)), (True, (20, 0)), (True, (20, 2)), (True, (20, 4)), (True, (20, 
6)), (True, (20, 8)), (True, (20, 14)), (True, (20, 16)), (True, (20, 18)), (True, (20, 20)), (True, (22, 10)), (True, (22, 12)), (True, (22, 22)), (True, (22, 24)), (True, (22, 26)), (True, (22, 0)), (True, (22, 2)), (True, (22, 4)), (True, (22, 6)), (True, (22, 8)), (True, (22, 14)), (True, (22, 16)), (True, (22, 18)), (True, (22, 20)), (True, (24, 10)), (True, (24, 12)), (True, (24, 22)), (True, (24, 24)), (True, (24, 26)), (True, (24, 0)), (True, (24, 2)), (True, (24, 4)), (True, (24, 6)), (True, (24, 8)), (True, (24, 14)), (True, (24, 16)), (True, (24, 18)), (True, (24, 20)), (True, (30, 10)), (True, (30, 12)), (True, (30, 22)), (True, (30, 24)), (True, (30, 26)), (True, (30, 0)), (True, (30, 2)), (True, (30, 4)), (True, (30, 6)), (True, (30, 8)), (True, (30, 14)), (True, (30, 16)), (True, (30, 18)), (True, (30, 20)), (True, (32, 10)), (True, (32, 12)), (True, (32, 22)), (True, (32, 24)), (True, (32, 26)), (True, (32, 0)), (True, (32, 2)), (True, (32, 4)), (True, (32, 6)), (True, (32, 8)), (True, (32, 14)), (True, (32, 16)), (True, (32, 18)), (True, (32, 20)), (True, (34, 10)), (True, (34, 12)), (True, (34, 22)), (True, (34, 24)), (True, (34, 26)), (True, (34, 0)), (True, (34, 2)), (True, (34, 4)), (True, (34, 6)), (True, (34, 8)), (True, (34, 14)), (True, (34, 16)), (True, (34, 18)), (True, (34, 20)), (True, (40, 10)), (True, (40, 12)), (True, (40, 22)), (True, (40, 24)), (True, (40, 26)), (True, (40, 0)), (True, (40, 2)), (True, (40, 4)), (True, (40, 6)), (True, (40, 8)), (True, (40, 14)), (True, (40, 16)), (True, (40, 18)), (True, (40, 20)), (True, (42, 10)), (True, (42, 12)), (True, (42, 22)), (True, (42, 24)), (True, (42, 26)), (True, (42, 0)), (True, (42, 2)), (True, (42, 4)), (True, (42, 6)), (True, (42, 8)), (True, (42, 14)), (True, (42, 16)), (True, (42, 18)), (True, (42, 20)), (True, (44, 10)), (True, (44, 12)), (True, (44, 22)), (True, (44, 24)), (True, (44, 26)), (True, (44, 0)), (True, (44, 2)), (True, (44, 4)), (True, (44, 6)), (True, (44, 8)), 
(True, (44, 14)), (True, (44, 16)), (True, (44, 18)), (True, (44, 20)), (True, (50, 10)), (True, (50, 12)), (True, (50, 22)), (True, (50, 24)), (True, (50, 26)), (True, (50, 0)), (True, (50, 2)), (True, (50, 4)), (True, (50, 6)), (True, (50, 8)), (True, (50, 14)), (True, (50, 16)), (True, (50, 18)), (True, (50, 20)), (True, (52, 10)), (True, (52, 12)), (True, (52, 22)), (True, (52, 24)), (True, (52, 26)), (True, (52, 0)), (True, (52, 2)), (True, (52, 4)), (True, (52, 6)), (True, (52, 8)), (True, (52, 14)), (True, (52, 16)), (True, (52, 18)), (True, (52, 20)), (True, (54, 10)), (True, (54, 12)), (True, (54, 22)), (True, (54, 24)), (True, (54, 26)), (True, (54, 0)), (True, (54, 2)), (True, (54, 4)), (True, (54, 6)), (True, (54, 8)), (True, (54, 14)), (True, (54, 16)), (True, (54, 18)), (True, (54, 20)), (True, (60, 10)), (True, (60, 12)), (True, (60, 22)), (True, (60, 24)), (True, (60, 26)), (True, (60, 0)), (True, (60, 2)), (True, (60, 4)), (True, (60, 6)), (True, (60, 8)), (True, (60, 14)), (True, (60, 16)), (True, (60, 18)), (True, (60, 20)), (True, (62, 10)), (True, (62, 12)), (True, (62, 22)), (True, (62, 24)), (True, (62, 26)), (True, (62, 0)), (True, (62, 2)), (True, (62, 4)), (True, (62, 6)), (True, (62, 8)), (True, (62, 14)), (True, (62, 16)), (True, (62, 18)), (True, (62, 20)), (True, (64, 10)), (True, (64, 12)), (True, (64, 22)), (True, (64, 24)), (True, (64, 26)), (True, (64, 0)), (True, (64, 2)), (True, (64, 4)), (True, (64, 6)), (True, (64, 8)), (True, (64, 14)), (True, (64, 16)), (True, (64, 18)), (True, (64, 20)), (True, (70, 10)), (True, (70, 12)), (True, (70, 22)), (True, (70, 24)), (True, (70, 26)), (True, (70, 0)), (True, (70, 2)), (True, (70, 4)), (True, (70, 6)), (True, (70, 8)), (True, (70, 14)), (True, (70, 16)), (True, (70, 18)), (True, (70, 20)), (True, (72, 10)), (True, (72, 12)), (True, (72, 22)), (True, (72, 24)), (True, (72, 26)), (True, (72, 0)), (True, (72, 2)), (True, (72, 4)), (True, (72, 6)), (True, (72, 8)), (True, (72, 14)), 
(True, (72, 16)), (True, (72, 18)), (True, (72, 20)), (True, (74, 10)), (True, (74, 12)), (True, (74, 22)), (True, (74, 24)), (True, (74, 26)), (True, (74, 0)), (True, (74, 2)), (True, (74, 4)), (True, (74, 6)), (True, (74, 8)), (True, (74, 14)), (True, (74, 16)), (True, (74, 18)), (True, (74, 20))] | |
# Cogroup: [(False, (<pyspark.resultiterable.ResultIterable object at 0x7f7e25b4e518>, <pyspark.resultiterable.ResultIterable object at 0x7f7e2279b4e0>)), (True, (<pyspark.resultiterable.ResultIterable object at 0x7f7e2279b4a8>, <pyspark.resultiterable.ResultIterable object at 0x7f7e2279b780>))] | |
# After groupByKey: [[(False, <pyspark.resultiterable.ResultIterable object at 0x7f7e25b4e470>)], [(True, <pyspark.resultiterable.ResultIterable object at 0x7f7e22751278>)], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []] | |
# Summary statistics of the numeric RDD.
# count, mean, stdev, max and min gathered together
summary = rdd.stats()
print(summary)
print(rdd.count())
print(rdd.sum())
print(rdd.mean())
# standard deviation, followed by its sample counterpart
stdev_value, sample_stdev_value = rdd.stdev(), rdd.sampleStdev()
print(stdev_value, sample_stdev_value)
# variance, followed by its sample counterpart
variance_value, sample_variance_value = rdd.variance(), rdd.sampleVariance()
print(variance_value, sample_variance_value)
smallest, largest = rdd.min(), rdd.max()
print(smallest, largest)
# bucket boundaries and per-bucket counts for 5 buckets
print(rdd.histogram(5))
# (count: 100, mean: 49.5, stdev: 28.8660700477, max: 99.0, min: 0.0) | |
# 100 | |
# 4950 | |
# 49.5 | |
# 28.8660700477 29.0114919759 | |
# 833.25 841.6666666666666 | |
# 0 99 | |
# ([0.0, 19.8, 39.6, 59.400000000000006, 79.2, 99], [20, 20, 20, 20, 20]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment