Skip to content

Instantly share code, notes, and snippets.

@rainsunny
Last active August 5, 2020 06:32
Show Gist options
  • Save rainsunny/d148b89a876a722a3d70ee2593f023fe to your computer and use it in GitHub Desktop.
Save rainsunny/d148b89a876a722a3d70ee2593f023fe to your computer and use it in GitHub Desktop.
Plot histogram using python 绘制直方图的方法,以及二维直方图(热力图)的绘制方法,用于观察变量的分布情况
# 附带一个用spark将数据取回本地用于绘图的方法
def toArr(df, col, dtype=np.int32):
    """
    Fetch one column of a Spark DataFrame into a local NumPy array.

    The column is materialised with ``collect()``, so all of its values
    are brought into the local process before the array is built.

    df: DataFrame to read from
    col: name of the column to fetch
    dtype: element type passed to ``np.array`` for the result
    return: the column's values as an np.ndarray
    """
    single_column_rdd = df.select(col).rdd
    # Each row of the one-column selection is flattened into its elements.
    values = single_column_rdd.flatMap(lambda row: row).collect()
    return np.array(values, dtype=dtype)
# 获取DataFrame的某一列并转为RDD
def toRdd(df, col):
    """
    Return one column of a DataFrame as an RDD of its values.

    df: DataFrame to read from
    col: name of the column to select
    return: the RDD produced by flattening each row of the selection
    """
    selected = df.select(col)
    return selected.rdd.flatMap(lambda row: row)
import matplotlib as mpl
def plot_hist2d(x, y, title="Count dists", xlabel="x count", ylabel="y count", nbins=20,
                figsize=(10,8), ranges=[[0,10],[0,10]], cmin=None, cmax=None):
    """
    Draw the joint distribution of two variables as a 2D histogram (heat map).

    x, y: the two coordinates of the samples; np.ndarray of equal length
    title: figure title
    xlabel, ylabel: axis labels
    nbins: number of bins, forwarded to ``hist2d`` as ``bins``
    figsize: figure size
    ranges: [x range, y range]; samples outside are ignored and not drawn.
            The default list is only read here, never modified.
    cmin, cmax: lower / upper count thresholds; bins whose count falls
                outside them are not shown
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Logarithmic colour scale, because the per-bin counts differ greatly.
    colour_norm = mpl.colors.LogNorm()
    # hist2d returns (counts, x edges, y edges, image); only the image is
    # needed, as the mappable for the colour bar.
    image = ax.hist2d(x, y, bins=nbins, range=ranges, cmap=plt.cm.jet,
                      norm=colour_norm, cmin=cmin, cmax=cmax)[3]
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    plt.colorbar(image)
    plt.show()
# 数据量太大,单机内存放不下的数据,如何绘制分布图?
# => 用RDD
def plot_hist_rdd(rdd, title="Count dist", xlabel="Count", ylabel="Frequency",
                  density=False, xlog=True, ylog=True, nbins=20):
    """
    Plot a histogram of an RDD without downloading its data locally.

    For very large data sets: the minimum, maximum, element count and the
    per-bin counts are obtained from the RDD's own ``min`` / ``max`` /
    ``count`` / ``histogram`` methods, and only the resulting bin edges and
    heights are drawn as a bar chart.

    rdd: RDD of numeric values to summarise
    title: figure title
    xlabel, ylabel: axis labels
    density: if True, scale the bar heights so that the bars integrate to 1
             (count / (total count * bin width)) instead of showing counts
    xlog, ylog: whether the x / y axis uses a log scale
    nbins: number of bin edges generated, so ``nbins - 1`` bars are drawn
    """
    lowest, highest, total = rdd.min(), rdd.max(), rdd.count()
    print("min: %d, max: %d, count: %d" % (lowest, highest, total))
    if xlog:
        # Geometrically spaced edges from 1 up to 2**ceil(log2(max)), with 0
        # prepended as the lowest edge. The edges are only strictly
        # increasing when the maximum is greater than 1.
        bins = np.insert(np.logspace(0, np.ceil(np.log2(highest)), nbins-1, base=2), 0, 0)
    else:
        bins = np.linspace(0, highest, num=nbins)
    print("bins:\n %s" % bins)
    edges, heights = rdd.histogram(bins.tolist())
    widths = np.subtract(edges[1:], edges[:-1])
    if density:
        counted = float(sum(heights))
        # Leave the heights untouched when nothing was counted, to avoid
        # dividing by zero.
        if counted > 0:
            heights = (np.asarray(heights, dtype=float) / (counted * widths)).tolist()
    print("N:\n %s" % heights)
    left_edges = edges[:-1]
    fig, ax = plt.subplots(figsize=(8,6))
    # Bars anchored at each bin's left edge and as wide as the bin reproduce
    # exactly what a histogram would draw.
    ax.bar(left_edges, heights, width=widths, align='edge')
    if xlog:
        ax.set_xscale("log")
    if ylog:
        ax.set_yscale("log")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()
##### 用来绘制关联网络里面节点度的分布、连通分量大小的分布等
%matplotlib inline
#### import necessary modules
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
#################################
def plot_hist(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False,
              xlog=True, ylog=True, nbins=20):
    """
    Plot a histogram of a local sample.

    Horizontal axis: the values being counted; vertical axis: how often
    they occur. The sample's min / max / length, the bin edges and the
    per-bin heights are printed along the way.

    x: sample to plot, np.ndarray
    title: figure title
    xlabel, ylabel: axis labels
    density: plot a normalised density instead of raw counts
    xlog, ylog: whether the x / y axis uses a log scale
    nbins: number of bin edges generated, so ``nbins - 1`` bins are drawn
    """
    # Report the sample's minimum, maximum and size.
    lowest, highest, size = x.min(), x.max(), len(x)
    print("min: %d, max: %d, length: %d" % (lowest, highest, size))
    if xlog:
        # Geometrically spaced edges from 1 up to 2**ceil(log2(max)), with 0
        # prepended as the lowest edge.
        bins = np.insert(np.logspace(0, np.ceil(np.log2(highest)), nbins-1, base=2), 0, 0)
    else:
        # Evenly spaced edges from 0 to the maximum.
        bins = np.linspace(0, highest, num=nbins)
    print("bins:\n %s" % bins)
    fig, ax = plt.subplots(figsize=(8,6))
    # Print the height of every bin as returned by hist.
    N, _, _ = ax.hist(x, bins=bins, density=density)
    print("N:\n %s" % N)
    if xlog:
        ax.set_xscale("log")
    if ylog:
        ax.set_yscale("log")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()
###############################
def plot_hist2(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False,
               nbins=20, figsize=(10, 12), cumu=False):
    """
    Draw two stacked histograms of the same sample for comparison.

    The upper panel shows raw counts on log-log axes; the lower panel shows
    a normalised histogram (optionally cumulative) with a log x axis and a
    linear y axis. Both panels share the same bin edges.

    x: sample to plot, np.ndarray
    title: title of the upper panel
    xlabel, ylabel: axis labels, applied to both panels
    density: accepted but not used by the body; the upper panel always
             shows counts and the lower panel always uses density=True
    nbins: number of bin edges generated, so ``nbins - 1`` bins are drawn
    figsize: figure size
    cumu: whether the lower panel is drawn as a cumulative histogram
    """
    lowest, highest, size = x.min(), x.max(), len(x)
    print("min: %d, max: %d, length: %d" % (lowest, highest, size))
    fig, (upper, lower) = plt.subplots(2, 1, figsize=figsize)

    # Geometrically spaced edges from 1 up to 2**ceil(log2(max)), with 0
    # prepended as the lowest edge.
    top_exponent = np.ceil(np.log2(highest))
    log_edges = np.logspace(0, top_exponent, nbins-1, base=2)
    bins = np.insert(log_edges, 0, 0)
    print("bins:\n %s" % bins)

    # Upper panel: raw counts, both axes logarithmic.
    counts = upper.hist(x, bins=bins)[0]
    print("N:\n %s" % counts)
    upper.set_xscale("log")
    upper.set_yscale("log")
    upper.set(title=title, xlabel=xlabel, ylabel=ylabel)
    upper.grid(True)

    # Lower panel: normalised (and optionally cumulative), log x axis only.
    lower.hist(x, bins=bins, density=True, cumulative=cumu)
    lower.set_xscale("log")
    lower.set(xlabel=xlabel, ylabel=ylabel)
    lower.grid(True)

    plt.show()
###########################################
@rainsunny
Copy link
Author

绘制的一个连通分量大小分布的直方图,满足幂律分布。有一个超大的连通子图。

image

@rainsunny
Copy link
Author

双变量分布直方图

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment