Last active
August 5, 2020 06:32
-
-
Save rainsunny/d148b89a876a722a3d70ee2593f023fe to your computer and use it in GitHub Desktop.
Plot histogram using python 绘制直方图的方法,以及二维直方图(热力图)的绘制方法,用于观察变量的分布情况
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Also included: a helper that uses Spark to pull data back to the local machine for plotting
def toArr(df, col, dtype=np.int32):
    """
    Collect a single DataFrame column locally and return it as a numpy.ndarray.

    df: source DataFrame
    col: name of the column to fetch
    dtype: numpy dtype used for the resulting array (defaults to np.int32)
    return: the column's values as an np.ndarray
    """
    # Selecting one column and flattening each row yields the bare values;
    # collect() then brings all of them back into local memory.
    column_values = df.select(col).rdd.flatMap(lambda row: row).collect()
    return np.array(column_values, dtype)
# Get one column of a DataFrame and convert it to an RDD
def toRdd(df, col):
    """
    Return one column of a DataFrame as an RDD of its values.

    df: source DataFrame
    col: name of the column to extract
    return: the result of flattening the single-column rows (nothing is collected here)
    """
    single_column = df.select(col)
    return single_column.rdd.flatMap(lambda row: row)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib as mpl | |
def plot_hist2d(x, y, title="Count dists", xlabel="x count", ylabel="y count", nbins=20,
                figsize=(10,8), ranges=[[0,10],[0,10]], cmin=None, cmax=None):
    """
    Draw a 2D histogram (heat map) of the joint distribution of two variables.

    x, y: the two dimensions of the sample set; np.ndarray of equal length
    title: figure title
    xlabel, ylabel: axis labels
    nbins: number of bins passed to hist2d
    figsize: figure size
    ranges: [x range, y range]; samples outside these ranges are ignored and not drawn
    cmin, cmax: lower / upper count thresholds; bins outside them are not drawn
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Label the axes first; this is independent of the drawing call below.
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Colors use a logarithmic normalization because bin counts differ by orders of magnitude.
    color_norm = mpl.colors.LogNorm()
    _, _, _, image = ax.hist2d(
        x, y,
        bins=nbins,
        cmap=plt.cm.jet,
        range=ranges,
        norm=color_norm,
        cmin=cmin,
        cmax=cmax,
    )

    plt.colorbar(image)
    plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How do we plot the distribution of data that is too large to fit in one machine's memory?
# => Use an RDD
def plot_hist_rdd(rdd, title="Count dist", xlabel="Count", ylabel="Frequency",
                  density=False, xlog=True, ylog=True, nbins=20):
    """
    Plot a histogram of an RDD, letting the cluster do the binning.

    Only the min, max, count and per-bin counts are brought back locally, so the
    full data set never has to be loaded into local memory.

    rdd: RDD of numeric values to summarize
    title: figure title
    xlabel, ylabel: axis labels
    density: if True, plot counts / (total count * bin width) instead of raw counts,
             the same normalization a density histogram uses
    xlog, ylog: whether the x / y axis uses a logarithmic scale
    nbins: number of bin edges generated (so nbins - 1 bars are drawn)
    """
    # Print the minimum, maximum and number of samples.
    m, n, l = rdd.min(), rdd.max(), rdd.count()
    print("min: %d, max: %d, count: %d" % (m, n, l))
    if xlog:
        # Power-of-two spaced edges up to the first power of two >= max,
        # with 0 prepended as the lowest boundary.
        bins = np.insert(np.logspace(0, np.ceil(np.log2(n)), nbins-1, base=2), 0, 0)
    else:
        bins = np.linspace(0, n, num=nbins)
    print("bins:\n %s" % bins)

    # The histogram is computed by the RDD itself; it returns the edges and the counts.
    full_bins, heights = rdd.histogram(bins.tolist())
    left_edges = full_bins[:-1]
    widths = np.subtract(full_bins[1:], full_bins[:-1])

    if density:
        total = sum(heights)
        if total > 0:  # leave an all-zero histogram untouched instead of dividing by zero
            heights = np.asarray(heights, dtype=float) / (total * widths)
    print("N:\n %s" % heights)

    fig, ax = plt.subplots(figsize=(8,6))
    # Bars anchored at each bin's left edge and as wide as the bin are
    # equivalent to a regular histogram drawing.
    ax.bar(left_edges, heights, width=widths, align='edge')
    if xlog:
        ax.set_xscale("log")
    if ylog:
        ax.set_yscale("log")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##### Used to plot distributions such as node degree and connected-component size in a relation network
%matplotlib inline | |
#### import necessary modules | |
import numpy as np | |
import pandas as pd | |
from matplotlib import pyplot as plt | |
################################# | |
def plot_hist(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False,
              xlog=True, ylog=True, nbins=20):
    """
    Plot a histogram.

    Horizontal axis: the values being counted; vertical axis: how often they occur.

    x: sample set to plot, np.ndarray
    title: figure title
    xlabel, ylabel: axis labels
    xlog, ylog: whether the x / y axis uses a logarithmic scale
    density: plot normalized frequencies instead of raw counts
    nbins: number of bin edges generated (so nbins - 1 bins are drawn)
    """
    # Print the minimum, maximum and number of samples.
    m, n, l = x.min(), x.max(), len(x)
    print("min: %d, max: %d, length: %d" % (m,n,l))
    if xlog:
        # Power-of-two spaced edges up to the first power of two >= max,
        # with 0 prepended as the lowest boundary.
        bins = np.insert(np.logspace(0, np.ceil(np.log2(n)), nbins-1, base=2), 0, 0)
    else:
        # Evenly spaced edges from 0 to the maximum value.
        bins = np.linspace(0, n, num=nbins)
    print("bins:\n %s" % bins)
    fig, ax = plt.subplots(figsize=(8,6))
    # Print the per-bin values returned by the histogram call.
    N, _, _ = ax.hist(x, bins=bins, density=density)
    print("N:\n %s" % N)
    if xlog:
        ax.set_xscale("log")
    if ylog:
        ax.set_yscale("log")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()
############################### | |
def plot_hist2(x, title="Count dist", xlabel="Count", ylabel="Frequency", density=False,
               nbins=20, figsize=(10, 12), cumu=False):
    """
    Draw two stacked histograms of the same sample set for side-by-side comparison.

    The upper plot shows raw counts on log-log axes; the lower plot shows the
    normalized (optionally cumulative) histogram with a log x axis only.

    x: sample set to plot, np.ndarray
    title: title of the upper plot
    xlabel, ylabel: axis labels, applied to both plots
    density: accepted for signature compatibility; not used by the body
             (the lower plot is always normalized)
    nbins: number of bin edges generated
    figsize: figure size
    cumu: whether the lower plot is cumulative
    """
    lowest, highest, size = x.min(), x.max(), len(x)
    print("min: %d, max: %d, length: %d" % (lowest,highest,size))

    fig, (upper, lower) = plt.subplots(2, 1, figsize=figsize)

    # Power-of-two spaced edges with 0 prepended as the lowest boundary.
    top_exponent = np.ceil(np.log2(highest))
    log_edges = np.logspace(0, top_exponent, nbins-1, base=2)
    bins = np.insert(log_edges, 0, 0)
    print("bins:\n %s" % bins)

    # Upper plot: raw counts, both axes logarithmic.
    counts, _, _ = upper.hist(x, bins=bins)
    print("N:\n %s" % counts)
    upper.set_title(title)
    upper.set_xscale("log")
    upper.set_yscale("log")
    upper.set_xlabel(xlabel)
    upper.set_ylabel(ylabel)
    upper.grid(True)

    # Lower plot: normalized histogram, logarithmic x axis only.
    lower.hist(x, bins=bins, density=True, cumulative=cumu)
    lower.set_xscale("log")
    lower.set_xlabel(xlabel)
    lower.set_ylabel(ylabel)
    lower.grid(True)

    plt.show()
########################################### | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
绘制的一个连通分量大小分布的直方图,满足幂律分布。有一个超大的连通子图。