rainsunny · August 5, 2020 06:32 · rainsunny · May 22, 2018 · rainsunny · May 22, 2018
diff --git a/helper.py b/helper.py
 # 附带一个用spark将数据取回本地用于绘图的方法
 def toArr(df, col, dtype=np.int32):
    """
    Fetch one column of a DataFrame back to the local process as a numpy.ndarray.

    df: source DataFrame
    col: name of the column to fetch
    dtype: element type handed to np.array for the result
    return: the values of that column, as an np.ndarray
    """
    column_rdd = df.select(col).rdd
    # flatMap with the identity function unpacks every row into its fields,
    # so the collected list holds plain values instead of row objects.
    values = column_rdd.flatMap(lambda row: row).collect()
    return np.array(values, dtype)

 # 获取DataFrame的某一列并转为RDD
 def toRdd(df, col):
    """Select `col` from `df` and return its RDD with every row flattened into its fields."""
    selected = df.select(col)
    return selected.rdd.flatMap(lambda row: row)
diff --git a/plot_hist_2d.py b/plot_hist_2d.py
 import matplotlib as mpl


 def plot_hist2d(x, y, title="Count dists", xlabel="x count", ylabel="y count", nbins=20,
                figsize=(10,8), ranges=[[0,10],[0,10]], cmin=None, cmax=None):
    """
    Draw the joint distribution of two variables as a 2-D histogram (heat map).

    x, y: the two dimensions of the sample set; must have equal length (np.ndarray)
    title: figure title
    xlabel, ylabel: axis labels
    nbins: number of bins
    figsize: figure size handed to plt.subplots
    ranges: the x range and the y range; samples outside them are ignored and not drawn
    cmin, cmax: lower / upper count thresholds; bins outside them are not drawn
    """
    figure, axes = plt.subplots(figsize=figsize)

    # Counts differ by orders of magnitude, so the colour scale is log-normalised.
    hist_options = dict(
        bins=nbins,
        cmap=plt.cm.jet,
        range=ranges,
        norm=mpl.colors.LogNorm(),
        cmin=cmin,
        cmax=cmax,
    )
    _counts, _x_edges, _y_edges, image = axes.hist2d(x, y, **hist_options)

    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)

    plt.colorbar(image)
    plt.show()
    
diff --git a/plot_hist_rdd.py b/plot_hist_rdd.py
 # 数据量太大，单机内存放不下的数据，如何绘制分布图？
 # => 用RDD

 def plot_hist_rdd(rdd, title="Count dist", xlabel="Count", ylabel="Frequency", 
                   density=False, xlog=True, ylog=True, nbins=20):
    """
    Plot a histogram of a very large dataset, computing the bin counts on the cluster.

    Only the min, max, count and per-bin counts are brought back to the local
    process; the samples themselves are never collected.

    rdd: RDD of numeric samples
    title: figure title
    xlabel, ylabel: axis labels
    density: if true, bar heights are count / (total count * bin width)
             instead of raw counts
    xlog, ylog: whether the x / y axis uses a logarithmic scale
    nbins: number of bin edges generated (so nbins - 1 bins)
    """
    lowest, highest, total = rdd.min(), rdd.max(), rdd.count()
    print("min: %d, max: %d, count: %d" % (lowest, highest, total))

    if xlog:
        # Clamp the maximum to 2 so that highest <= 1 cannot produce duplicate
        # (or non-finite) edges; the bucket list given to histogram() must not
        # contain duplicates. For highest > 1 the exponent is unchanged.
        top_exponent = np.ceil(np.log2(max(highest, 2)))
        # Powers of two as edges, with 0 inserted as the lowest boundary.
        bins = np.insert(np.logspace(0, top_exponent, nbins-1, base=2), 0, 0)
    else:
        bins = np.linspace(0, highest, num=nbins)

    print("bins:\n %s" % bins)

    full_bins, heights = rdd.histogram(bins.tolist())

    left_edges = full_bins[:-1]
    widths = np.subtract(full_bins[1:], full_bins[:-1])

    if density:
        counted = sum(heights)
        if counted:
            # Normalise so that the bar areas sum to 1.
            heights = np.divide(heights, counted * widths)
    print("N:\n %s" % heights)

    fig, ax = plt.subplots(figsize=(8,6))
    # Bars anchored at each bin's left edge and as wide as the bin reproduce a histogram.
    ax.bar(left_edges, heights, width=widths, align='edge')

    if xlog:
        ax.set_xscale("log")
    if ylog:
        ax.set_yscale("log")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()
diff --git a/plot_histogram.py b/plot_histogram.py
 ##### 用来绘制关联网络里面节点度的分布、连通分量大小的分布等


 %matplotlib inline

 ####  import necessary modules
 import numpy as np
 import pandas as pd
 from matplotlib import pyplot as plt


 ################################# 

 def plot_hist(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False, 
               xlog=True, ylog=True, nbins=20):
    """
    Plot a histogram of the samples in x.

    Horizontal axis: the sampled values; vertical axis: how often they occur.

    x: samples to plot (np.ndarray)
    title: figure title
    xlabel, ylabel: axis labels
    xlog, ylog: whether the x / y axis uses a logarithmic scale
    density: forwarded to Axes.hist; plot a normalised density instead of raw counts
    nbins: number of bin edges generated (so nbins - 1 bins)
    """
    # Report the smallest and largest sample and the sample count.
    lowest, highest, size = x.min(), x.max(), len(x)
    print("min: %d, max: %d, length: %d" % (lowest, highest, size))

    if xlog:
        # Powers of two up to the first one >= max, with 0 inserted as the lowest edge.
        bins = np.insert(np.logspace(0, np.ceil(np.log2(highest)), nbins-1, base=2), 0, 0)
    else:
        # Evenly spaced edges from 0 to the maximum (was the misspelt np.linepace).
        bins = np.linspace(0, highest, num=nbins)
    print("bins:\n %s" % bins)

    fig, ax = plt.subplots(figsize=(8,6))

    # Print the per-bin values returned by hist.
    N, _, _ = ax.hist(x, bins=bins, density=density)
    print("N:\n %s" % N)

    if xlog:
        ax.set_xscale("log")
    if ylog:
        ax.set_yscale("log")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()

 ############################### 

 def plot_hist2(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False, 
                nbins=20, figsize=(10, 12), cumu=False):
    """
    Draw two stacked histograms of the same samples for side-by-side comparison.

    Top: raw counts on log-log axes. Bottom: normalised histogram
    (optionally cumulative) with only the x axis logarithmic.

    x: samples to plot (np.ndarray)
    title: title of the top plot
    xlabel, ylabel: axis labels, applied to both plots
    density: not read by the function body; the bottom plot always uses density=True
    nbins: number of bin edges generated (so nbins - 1 bins)
    figsize: size of the whole figure
    cumu: whether the bottom plot is cumulative
    """
    lowest, highest, size = x.min(), x.max(), len(x)
    print("min: %d, max: %d, length: %d" % (lowest, highest, size))

    figure, (top, bottom) = plt.subplots(2, 1, figsize=figsize)

    # Powers of two up to the first one >= max, with 0 inserted as the lowest edge.
    log_edges = np.logspace(0, np.ceil(np.log2(highest)), nbins-1, base=2)
    bins = np.insert(log_edges, 0, 0)
    print("bins:\n %s" % bins)

    counts, _edges, _patches = top.hist(x, bins=bins)
    print("N:\n %s" % counts)

    top.set_xscale("log")
    top.set_yscale("log")
    top.set_title(title)
    top.set_xlabel(xlabel)
    top.set_ylabel(ylabel)
    top.grid(True)

    bottom.hist(x, bins=bins, density=True, cumulative=cumu)
    bottom.set_xscale("log")
    bottom.set_xlabel(xlabel)
    bottom.set_ylabel(ylabel)
    bottom.grid(True)

    plt.show()

 ###########################################
	# 附带一个用spark将数据取回本地用于绘图的方法
	def toArr(df, col, dtype=np.int32):
	    """
	    Fetch one column of a DataFrame back to the local process as a numpy.ndarray.

	    df: source DataFrame
	    col: name of the column to fetch
	    dtype: element type handed to np.array for the result
	    return: the values of that column, as an np.ndarray
	    """
	    # flatMap with the identity function unpacks every row into its fields,
	    # so the collected list holds plain values instead of row objects.
	    d = df.select(col).rdd.flatMap(lambda x: x).collect()
	    return np.array(d, dtype)

	# 获取DataFrame的某一列并转为RDD
	def toRdd(df, col):
	    """Select `col` from `df` and return its RDD with every row flattened into its fields."""
	    return df.select(col).rdd.flatMap(lambda x: x)
	import matplotlib as mpl


	def plot_hist2d(x, y, title="Count dists", xlabel="x count", ylabel="y count", nbins=20,
	                figsize=(10,8), ranges=[[0,10],[0,10]], cmin=None, cmax=None):
	    """
	    Draw the joint distribution of two variables as a 2-D histogram (heat map).

	    x, y: the two dimensions of the sample set; must have equal length (np.ndarray)
	    title: figure title
	    xlabel, ylabel: axis labels
	    nbins: number of bins
	    figsize: figure size handed to plt.subplots
	    ranges: the x range and the y range; samples outside them are ignored and not drawn
	    cmin, cmax: lower / upper count thresholds; bins outside them are not drawn
	    """
	    fig, ax = plt.subplots(figsize=figsize)

	    # Counts differ by orders of magnitude, so the colour scale is log-normalised.
	    h, xe, ye, im = ax.hist2d(x, y, bins=nbins, cmap=plt.cm.jet, range=ranges,
	                              norm=mpl.colors.LogNorm(), cmin=cmin, cmax=cmax)
	    ax.set_title(title)
	    ax.set_xlabel(xlabel)
	    ax.set_ylabel(ylabel)

	    plt.colorbar(im)
	    plt.show()
	# 数据量太大，单机内存放不下的数据，如何绘制分布图？
	# => 用RDD

	def plot_hist_rdd(rdd, title="Count dist", xlabel="Count", ylabel="Frequency",
	                  density=False, xlog=True, ylog=True, nbins=20):
	    """
	    Plot a histogram of a very large dataset, computing the bin counts on the cluster.

	    Only the min, max, count and per-bin counts are brought back to the local
	    process; the samples themselves are never collected.

	    rdd: RDD of numeric samples
	    title: figure title
	    xlabel, ylabel: axis labels
	    density: if true, bar heights are count / (total count * bin width)
	             instead of raw counts
	    xlog, ylog: whether the x / y axis uses a logarithmic scale
	    nbins: number of bin edges generated (so nbins - 1 bins)
	    """
	    m, n, l = rdd.min(), rdd.max(), rdd.count()
	    print("min: %d, max: %d, count: %d" % (m, n, l))

	    if xlog:
	        # Clamp the maximum to 2 so that n <= 1 cannot produce duplicate
	        # (or non-finite) edges; the bucket list given to histogram() must
	        # not contain duplicates. For n > 1 the exponent is unchanged.
	        top_exponent = np.ceil(np.log2(max(n, 2)))
	        # Powers of two as edges, with 0 inserted as the lowest boundary.
	        bins = np.insert(np.logspace(0, top_exponent, nbins-1, base=2), 0, 0)
	    else:
	        bins = np.linspace(0, n, num=nbins)

	    print("bins:\n %s" % bins)

	    full_bins, heights = rdd.histogram(bins.tolist())

	    left_edges = full_bins[:-1]
	    widths = np.subtract(full_bins[1:], full_bins[:-1])

	    if density:
	        counted = sum(heights)
	        if counted:
	            # Normalise so that the bar areas sum to 1.
	            heights = np.divide(heights, counted * widths)
	    print("N:\n %s" % heights)

	    fig, ax = plt.subplots(figsize=(8,6))
	    # Bars anchored at each bin's left edge and as wide as the bin reproduce a histogram.
	    ax.bar(left_edges, heights, width=widths, align='edge')

	    if xlog:
	        ax.set_xscale("log")
	    if ylog:
	        ax.set_yscale("log")
	    ax.set_title(title)
	    ax.set_xlabel(xlabel)
	    ax.set_ylabel(ylabel)
	    ax.grid(True)
	    plt.show()
	##### 用来绘制关联网络里面节点度的分布、连通分量大小的分布等


	%matplotlib inline

	#### import necessary modules
	import numpy as np
	import pandas as pd
	from matplotlib import pyplot as plt


	#################################

	def plot_hist(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False,
	              xlog=True, ylog=True, nbins=20):
	    """
	    Plot a histogram of the samples in x.

	    Horizontal axis: the sampled values; vertical axis: how often they occur.

	    x: samples to plot (np.ndarray)
	    title: figure title
	    xlabel, ylabel: axis labels
	    xlog, ylog: whether the x / y axis uses a logarithmic scale
	    density: forwarded to Axes.hist; plot a normalised density instead of raw counts
	    nbins: number of bin edges generated (so nbins - 1 bins)
	    """
	    # Report the smallest and largest sample and the sample count.
	    m, n, l = x.min(), x.max(), len(x)
	    print("min: %d, max: %d, length: %d" % (m,n,l))

	    if xlog:
	        # Powers of two up to the first one >= max, with 0 inserted as the lowest edge.
	        bins = np.insert(np.logspace(0, np.ceil(np.log2(n)), nbins-1, base=2), 0, 0)
	    else:
	        # Evenly spaced edges from 0 to the maximum (was the misspelt np.linepace).
	        bins = np.linspace(0, n, num=nbins)
	    print("bins:\n %s" % bins)

	    fig, ax = plt.subplots(figsize=(8,6))

	    # Print the per-bin values returned by hist.
	    N, _, _ = ax.hist(x, bins=bins, density=density)
	    print("N:\n %s" % N)

	    if xlog:
	        ax.set_xscale("log")
	    if ylog:
	        ax.set_yscale("log")
	    ax.set_title(title)
	    ax.set_xlabel(xlabel)
	    ax.set_ylabel(ylabel)
	    ax.grid(True)
	    plt.show()

	###############################

	def plot_hist2(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False,
	               nbins=20, figsize=(10, 12), cumu=False):
	    """
	    Draw two stacked histograms of the same samples for side-by-side comparison.

	    Top: raw counts on log-log axes. Bottom: normalised histogram
	    (optionally cumulative) with only the x axis logarithmic.

	    x: samples to plot (np.ndarray)
	    title: title of the top plot
	    xlabel, ylabel: axis labels, applied to both plots
	    density: not read by the function body; the bottom plot always uses density=True
	    nbins: number of bin edges generated (so nbins - 1 bins)
	    figsize: size of the whole figure
	    cumu: whether the bottom plot is cumulative
	    """
	    m, n, l = x.min(), x.max(), len(x)
	    print("min: %d, max: %d, length: %d" % (m,n,l))

	    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize)
	    # Powers of two up to the first one >= max, with 0 inserted as the lowest edge.
	    bins = np.insert(np.logspace(0, np.ceil(np.log2(n)), nbins-1, base=2), 0, 0)
	    print("bins:\n %s" % bins)

	    N, _, _ = ax1.hist(x, bins=bins)
	    print("N:\n %s" % N)

	    ax1.set_xscale("log")
	    ax1.set_yscale("log")
	    ax1.set_title(title)
	    ax1.set_xlabel(xlabel)
	    ax1.set_ylabel(ylabel)
	    ax1.grid(True)

	    ax2.hist(x, bins=bins, density=True, cumulative=cumu)
	    ax2.set_xscale("log")
	    ax2.set_xlabel(xlabel)
	    ax2.set_ylabel(ylabel)
	    ax2.grid(True)

	    plt.show()

	###########################################