@jorenham
Last active July 17, 2025 21:25
scipy usage survey
module usage_count
scipy.stats 1556
scipy.signal 1506
scipy.linalg 1342
scipy.optimize 1319
scipy.sparse 748
scipy.interpolate 497
scipy.special 441
scipy.ndimage 434
scipy.spatial 417
scipy.integrate 383
scipy.io 321
scipy.fftpack 52
scipy.fft 51
scipy.constants 36
scipy.cluster 36
scipy.odr 6
function usage_count
scipy.optimize.minimize 287
scipy.stats.norm 232
scipy.interpolate.interp1d 204
scipy.spatial.distance 192
scipy.sparse.linalg 172
scipy.signal.lfilter 141
scipy.sparse.csr_matrix 134
scipy.optimize.curve_fit 133
scipy.integrate.solve_ivp 109
scipy.stats.pearsonr 89
scipy.io.wavfile 88
scipy.integrate.quad 86
scipy.io.loadmat 85
scipy.signal.find_peaks 83
scipy.linalg.sqrtm 79
scipy.linalg.eigh 79
scipy.signal.butter 78
scipy.linalg.norm 75
scipy.signal.savgol_filter 69
scipy.linalg.svd 67
scipy.signal.filtfilt 66
scipy.linalg.block_diag 65
scipy.integrate.odeint 65
scipy.ndimage.gaussian_filter 65
scipy.special.gamma 62
scipy.signal.convolve2d 60
scipy.signal.resample 57
scipy.linalg.inv 51
scipy.stats.spearmanr 50
scipy.linalg.expm 50
scipy.linalg.solve 50
scipy.signal.medfilt 49
scipy.special.softmax 48
scipy.optimize.fmin 47
scipy.sparse.coo_matrix 47
scipy.optimize.fsolve 45
scipy.ndimage.gaussian_filter1d 45
scipy.spatial.cKDTree 45
scipy.linalg.cho_solve 44
scipy.interpolate.CubicSpline 44
scipy.signal.windows 42
scipy.optimize.leastsq 41
scipy.interpolate.griddata 41
scipy.signal.fftconvolve 40
scipy.stats.multivariate_normal 39
scipy.sparse.lil_matrix 39
scipy.signal.firwin 39
scipy.optimize.linear_sum_assignment 38
scipy.sparse.csc_matrix 37
scipy.optimize.least_squares 37
scipy.spatial.KDTree 37
scipy.optimize.root 37
scipy.spatial.ConvexHull 36
scipy.stats.binom 35
scipy.stats.entropy 35
scipy.linalg.qr 35
scipy.linalg.eig 32
scipy.special.expit 32
scipy.signal.welch 31
scipy.ndimage.zoom 31
scipy.optimize.fmin_l_bfgs_b 31
scipy.fft.fft 31
scipy.stats.linregress 30
scipy.stats.sem 30
scipy.sparse.issparse 30
scipy.stats.rv_continuous 30
scipy.sparse.diags 29
scipy.optimize.brentq 29
scipy.stats.chi2 28
scipy.special.erf 28
scipy.linalg.cholesky 28
scipy.linalg.cho_factor 28
scipy.sparse.csgraph 28
scipy.spatial.Delaunay 28
scipy.stats.ttest_ind 27
scipy.signal.hilbert 27
scipy.special.comb 26
scipy.linalg.solve_triangular 26
scipy.interpolate.UnivariateSpline 26
scipy.stats.rankdata 25
scipy.stats.kendalltau 25
scipy.ndimage.filters 25
scipy.special.logsumexp 25
scipy.stats.gamma 25
scipy.linalg.lstsq 25
scipy.linalg.toeplitz 24
scipy.signal.stft 24
scipy.ndimage.binary_dilation 24
scipy.stats.poisson 23
scipy.stats.beta 23
scipy.special.gammaln 23
scipy.linalg.logm 23
scipy.signal.freqz 23
scipy.interpolate.RegularGridInterpolator 23
scipy.signal.convolve 23
scipy.stats.zscore 22
scipy.io.savemat 22
scipy.ndimage.label 22
scipy.linalg.blas 22
scipy.signal.argrelextrema 22
scipy.optimize.minimize_scalar 21
scipy.spatial.transform 20
scipy.integrate.simps 20
scipy.signal.istft 20
scipy.sparse.hstack 20
scipy.cluster.hierarchy 19
scipy.stats.gaussian_kde 19
scipy.interpolate.splev 19
scipy.stats.mode 18
scipy.stats.skew 18
scipy.ndimage.median_filter 18
scipy.optimize.newton 18
scipy.fftpack.fft 18
scipy.optimize.OptimizeResult 18
scipy.ndimage.interpolation 18
scipy.stats.t 17
scipy.ndimage.generic_filter 17
scipy.stats.mstats 17
scipy.linalg.lu 17
scipy.optimize.bisect 17
scipy.ndimage.distance_transform_edt 17
scipy.stats.kurtosis 16
scipy.stats.bernoulli 16
scipy.special.psi 16
scipy.linalg.pinv 16
scipy.optimize.root_scalar 16
scipy.sparse.identity 16
scipy.sparse.eye 16
scipy.signal.get_window 16
scipy.optimize.linprog 16
scipy.fft.fftfreq 16
scipy.ndimage.convolve 16
scipy.stats.truncnorm 15
scipy.optimize.differential_evolution 15
scipy.interpolate.RectBivariateSpline 15
scipy.stats.ttest_rel 14
scipy.signal.decimate 14
scipy.stats.fisher_exact 14
scipy.linalg.fractional_matrix_power 14
scipy.linalg.null_space 14
scipy.signal.medfilt2d 14
scipy.signal.correlate2d 14
scipy.sparse.csr 14
scipy.sparse.spdiags 13
scipy.linalg.lapack 13
scipy.interpolate.interp2d 13
scipy.special.sph_harm 13
scipy.interpolate.Akima1DInterpolator 13
scipy.signal.correlate 13
scipy.optimize.rosen 13
scipy.special.binom 12
scipy.stats.stats 12
scipy.ndimage.morphology 12
scipy.linalg.lu_solve 12
scipy.interpolate.InterpolatedUnivariateSpline 12
scipy.stats.lognorm 12
scipy.signal.hann 12
scipy.stats.uniform 11
scipy.interpolate.splrep 11
scipy.linalg.hadamard 11
scipy.ndimage.rotate 11
scipy.misc.imread 11
scipy.integrate.ode 11
scipy.integrate.simpson 11
scipy.spatial.Voronoi 11
scipy.ndimage.binary_erosion 11
scipy.stats.chi2_contingency 10
scipy.linalg.lu_factor 10
scipy.linalg.solve_discrete_are 10
scipy.interpolate.BSpline 10
scipy.optimize.nnls 10
scipy.ndimage.convolve1d 10
scipy.optimize.Bounds 10
scipy.interpolate.Rbf 10
scipy.interpolate.splprep 10
scipy.signal.sosfilt 10
scipy.sparse.vstack 10
scipy.spatial.distance_matrix 10
scipy.special.factorial 10
scipy.optimize.rosen_der 10
scipy.stats.rv_discrete 9
scipy.ndimage.uniform_filter1d 9
scipy.sparse.kron 9
scipy.cluster.vq 9
scipy.interpolate.make_interp_spline 9
scipy.signal.peak_widths 9
scipy.optimize.basinhopping 9
scipy.fft.ifft 9
scipy.signal.resample_poly 9
scipy.stats.wilcoxon 9
scipy.ndimage.imread 9
scipy.misc.imsave 9
scipy.stats.f 8
scipy.stats.mannwhitneyu 8
scipy.linalg.circulant 8
scipy.integrate.cumtrapz 8
scipy.linalg.dft 8
scipy.linalg.eigvals 8
scipy.ndimage.maximum_filter 8
scipy.optimize.approx_fprime 8
scipy.integrate.dblquad 8
scipy.stats.distributions 8
scipy.special.loggamma 8
scipy.interpolate.LinearNDInterpolator 8
scipy.signal.gaussian 8
scipy.signal.detrend 8
scipy.signal.sosfiltfilt 8
scipy.signal.hamming 8
scipy.sparse.block_diag 8
scipy.signal.signaltools 8
scipy.stats.nbinom 7
scipy.signal.argrelmax 7
scipy.stats.chisquare 7
scipy.special.kv 7
scipy.special.lambertw 7
scipy.linalg.det 7
scipy.integrate.cumulative_trapezoid 7
scipy.optimize.fmin_bfgs 7
scipy.integrate.solve_bvp 7
scipy.stats.rv_histogram 7
scipy.ndimage.binary_fill_holes 7
scipy.special.digamma 7
scipy.stats.probplot 7
scipy.stats.scoreatpercentile 7
scipy.stats.shapiro 6
scipy.stats.ortho_group 6
scipy.linalg.schur 6
scipy.linalg.eigvalsh 6
scipy.linalg.sinm 6
scipy.linalg.cosm 6
scipy.linalg.rq 6
scipy.linalg.LinAlgError 6
scipy.linalg.solve_banded 6
scipy.optimize.NonlinearConstraint 6
scipy.signal.hanning 6
scipy.fftpack.dct 6
scipy.fftpack.ifft 6
scipy.optimize.fmin_tnc 6
scipy.interpolate.PchipInterpolator 6
scipy.signal.iirfilter 6
scipy.signal.spectrogram 6
scipy.stats.iqr 6
scipy.sparse.dok_matrix 6
scipy.sparse.save_npz 6
scipy.special.legendre 6
scipy.optimize.nonlin 6
scipy.stats.pareto 6
scipy.linalg.solve_continuous_are 6
scipy.signal.coherence 6
scipy.stats.expon 5
scipy.linalg.pinv2 5
scipy.optimize.fmin_cobyla 5
scipy.linalg.orthogonal_procrustes 5
scipy.signal.kaiserord 5
scipy.optimize.dual_annealing 5
scipy.special.jv 5
scipy.integrate.trapz 5
scipy.constants.c 5
scipy.special.spherical_jn 5
scipy.fftpack.fftshift 5
scipy.special.hyp2f1 5
scipy.interpolate.NearestNDInterpolator 5
scipy.interpolate.lagrange 5
scipy.spatial.procrustes 5
scipy.stats.kde 5
scipy.signal.spectral 5
scipy.signal.blackman 5
scipy.signal.peak_prominences 5
scipy.fft.fftshift 5
scipy.sparse.load_npz 5
scipy.sparse.isspmatrix 5
scipy.spatial.voronoi_plot_2d 5
scipy.ndimage.map_coordinates 5
scipy.ndimage.center_of_mass 5
scipy.special.logit 5
scipy.special.jn 5
scipy.signal.chirp 5
scipy.ndimage.shift 5
scipy.stats.binned_statistic_2d 5
scipy.signal.cheby1 5
scipy.optimize.check_grad 5
scipy.stats.loguniform 4
scipy.stats.qmc 4
scipy.stats.multinomial 4
scipy.stats.gmean 4
scipy.ndimage.measurements 4
scipy.stats.wasserstein_distance 4
scipy.stats.trim_mean 4
scipy.optimize.fminbound 4
scipy.linalg.orth 4
scipy.fftpack.fft2 4
scipy.fftpack.ifft2 4
scipy.sparse.random 4
scipy.constants.epsilon_0 4
scipy.ndimage.laplace 4
scipy.ndimage.minimum_filter 4
scipy.ndimage.sobel 4
scipy.optimize.optimize 4
scipy.signal.lombscargle 4
scipy.optimize._numdiff 4
scipy.constants.physical_constants 4
scipy.fftpack.rfft 4
scipy.integrate.trapezoid 4
scipy.interpolate.CloughTocher2DInterpolator 4
scipy.io.matlab 4
scipy.special.eval_legendre 4
scipy.stats.chisqprob 4
scipy.special.xlogy 4
scipy.ndimage.uniform_filter 4
scipy.signal.wiener 4
scipy.signal.triang 4
scipy.signal.lfilter_zi 4
scipy.sparse.data 4
scipy.sparse.csc 4
scipy.sparse.bmat 4
scipy.sparse.base 4
scipy.spatial.qhull 4
scipy.linalg.pinvh 4
scipy.linalg.solve_sylvester 4
scipy.linalg.cython_blas 4
scipy.sparse.spmatrix 4
scipy.stats.invgamma 4
scipy.stats.ttest_1samp 4
scipy.stats.gumbel_r 4
scipy.linalg.gr 4
scipy.stats.randint 3
scipy.stats.ks_2samp 3
scipy.misc.logsumexp 3
scipy.stats.boxcox 3
scipy.special.inv_boxcox 3
scipy.stats.betabinom 3
scipy.stats.ncx2 3
scipy.stats.binom_test 3
scipy.stats.kstest 3
scipy.signal.argrelmin 3
scipy.special.boxcox1p 3
scipy.io.mmread 3
scipy.stats.ranksums 3
scipy.linalg.expm_frechet 3
scipy.special.erfc 3
scipy.special.hermite 3
scipy.signal.kaiser_beta 3
scipy.stats.binned_statistic 3
scipy.optimize.toms748 3
scipy.optimize.fmin_powell 3
scipy.special.lpmn 3
scipy.constants.Avogadro 3
scipy.fftpack.fftfreq 3
scipy.misc.derivative 3
scipy.special.kve 3
scipy.signal.iirdesign 3
scipy.signal.blackmanharris 3
scipy.signal.place_poles 3
scipy.signal.iirnotch 3
scipy.sparse.coo_array 3
scipy.io.arff 3
scipy.special.gammainc 3
scipy.special.rel_entr 3
scipy.signal.upfirdn 3
scipy.stats.vonmises 3
scipy.linalg.svdvals 3
scipy.sparse.extract 3
scipy.optimize.broyden1 3
scipy.optimize.lbfgsb 3
scipy.fftpack.realtransforms 3
scipy.signal.cosine 3
scipy.signal.tukey 3
scipy.signal.waveforms 3
scipy.stats.tstd 3
scipy.linalg.tanhm 3
scipy.linalg.sinhm 3
scipy.linalg.coshm 3
scipy.interpolate.interpn 3
scipy.optimize.fmin_cg 3
scipy.optimise.minimize 3
scipy.signal.sawtooth 3
scipy.constants.pi 2
scipy.stats.f_oneway 2
scipy.stats.ecdf 2
scipy.stats.logistic 2
scipy.special.polygamma 2
scipy.special.eval_hermitenorm 2
scipy.linalg.ishermitian 2
scipy.constants.e 2
scipy.constants.m_p 2
scipy.signal.cont2discrete 2
scipy.special.sinc 2
scipy.signal.kaiser_atten 2
scipy.signal.lsim 2
scipy.signal.lti 2
scipy.linalg.eigh_tridiagonal 2
scipy.linalg.interpolative 2
scipy.linalg.polar 2
scipy.interpolate.RBFInterpolator 2
scipy.optimize.LinearConstraint 2
scipy.optimize.shgo 2
scipy.stats.signaltonoise 2
scipy.signal.kaiser 2
scipy.fftpack.idct 2
scipy.integrate.quad_vec 2
scipy.integrate.RK45 2
scipy.integrate.nquad 2
scipy.integrate.cumulative_simpson 2
scipy.integrate.fixed_quad 2
scipy.special.beta 2
scipy.special.kn 2
scipy.special.i0 2
scipy.signal.correlation_lags 2
scipy.ndimage.find_objects 2
scipy.signal.zpk2tf 2
scipy.signal.find_peaks_cwt 2
scipy.signal.filter_design 2
scipy.signal.square 2
scipy.stats.circmean 2
scipy.fftpack.irfft 2
scipy.fftpack.dst 2
scipy.linalg.funm 2
scipy.sparse.csc_array 2
scipy.sparse.csr_array 2
scipy.sparse.rand 2
scipy.spatial.geometric_slerp 2
scipy.spatial.convex_hull_plot_2d 2
scipy.ndimage.affine_transform 2
scipy.signal.lfiltic 2
scipy.special.boxcox 2
scipy.special.erfinv 2
scipy.special.perm 2
scipy.special.betainc 2
scipy.special.zeta 2
scipy.special.roots_legendre 2
scipy.signal.spline_filter 2
scipy.stats.percentileofscore 2
scipy.sparse.random_array 2
scipy.stats.bootstrap 2
scipy.optimize.fmin_ncg 2
scipy.sparse.dia_matrix 2
scipy.linalg.basic 2
scipy.ndimage.sum 2
scipy.linalg.get_lapack_funcs 2
scipy.optimize.lsq_linear 2
scipy.signal.bspline 2
scipy.linalg.solve_toeplitz 2
scipy.signal.cheb1ord 2
scipy.signal.bartlett 2
scipy.signal.periodogram 2
scipy.signal.boxcar 2
scipy.signal.flattop 2
scipy.signal.nuttall 2
scipy.signal.parzen 2
scipy.ndimage.correlate 2
scipy.ndimage.percentile_filter 2
scipy.ndimage.rank_filter 2
scipy.special.erfcinv 2
scipy.stats.variation 2
scipy.stats.contingency 2
scipy.stats.kruskal 2
scipy.state.norm 2
scipy.fftpack.ifftn 2
scipy.linalg.u_factor 2
scipy.linalg.cython_lapack 2
scipy.fft.fft2 2
scipy.fft.ifft2 2
scipy.special.multigammaln 2
scipy.io.harwell_boeing 2
scipy.optimize.broyden2 2
scipy.optimize.newton_krylov 2
scipy.special.chdtri 2
scipy.optimize.fmin_slsqp 2
scipy.special.expm1 1
scipy.stats.invgauss 1
scipy.stats.laplace 1
scipy.stats.geom 1
scipy.stats.unitary_group 1
scipy.stats.binomtest 1
scipy.stats.ConstantInputWarning 1
scipy.stats.weibull_min 1
scipy.stats.itemfreq 1
scipy.stats.describe 1
scipy.stats.hypergeom 1
scipy.stats.hmean 1
scipy.linalg.LinAlgWarning 1
scipy.linalg.convolution_matrix 1
scipy.linalg.kron 1
scipy.constants.hbar_si 1
scipy.constants.m_e 1
scipy.constants.elementary_charge 1
scipy.linalg.expm3 1
scipy.linalg.expm2 1
scipy.linalg.hankel 1
scipy.signal.remez 1
scipy.signal.minimum_phase 1
scipy.signal.group_delay 1
scipy.signal.bessel 1
scipy.linalg.eigvalsh_tridiagonal 1
scipy.ndimage.prewitt 1
scipy.linalg.solveh_banded 1
scipy.stats.levy 1
scipy.special.jvp 1
scipy.constants.k 1
scipy.constants.mu_0 1
scipy.constants.hbar 1
scipy.integrate.RK23 1
scipy.integrate.OdeSolution 1
scipy.integrate.quadrature 1
scipy.io.readsav 1
scipy.LowLevelCallable.html 1
scipy.special.ellipkinc 1
scipy.interpolate.PPoly 1
scipy.interpolate.interpolate 1
scipy.sparse.numpy 1
scipy.interpolate.LSQUnivariateSpline 1
scipy.interpolate.CubicHermiteSpline 1
scipy.stats.studentized_range 1
scipy.special._ufuncs 1
scipy.stats._boost 1
scipy.special.k1 1
scipy.special.iv 1
scipy.special.p_roots 1
scipy.interpolate.RegularGridData 1
scipy.constants.g 1
scipy.interpolate.make_lsq_spline 1
scipy.fftpack.ifftshift 1
scipy.signal.general_gaussian 1
scipy.fft.rfft 1
scipy.fft.rfftfreq 1
scipy.fft.irfft 1
scipy.signal.cwt 1
scipy.signal.ricker 1
scipy.signal.unit_impulse 1
scipy.signal.TransferFunction 1
scipy.signal.buttord 1
scipy.signal.bb 1
scipy.sparse.tril 1
scipy.signal.chebwin 1
scipy.sparse.diags_array 1
scipy.sparse.bsr_array 1
scipy.sparse.lil_array 1
scipy.sparse.isspmatrix_csr 1
scipy.sparse.isspmatrix_csc 1
scipy.linalg.ldl 1
scipy.sprase.hstack 1
scipy.sprase.vstack 1
scipy.interpolate.interpnd 1
scipy.constants.G 1
scipy.odr.Data 1
scipy.odr.ODR 1
scipy.odr.unilinear 1
scipy.spatial.HalfspaceIntersection 1
scipy.spatial.QhullError 1
scipy.ndimage.binary_closing 1
scipy.ndimage.binary_opening 1
scipy.special.cbrt 1
scipy.special.jn_zeros 1
scipy.special.airy 1
scipy.special.softplus 1
scipy.special.assoc_laguerre 1
scipy.special.kl_div 1
scipy.special.zetac 1
scipy.special.yn 1
scipy.special.laguerre 1
scipy.special.chebyt 1
scipy.stats.circstd 1
scipy.special.lpmv 1
scipy.special.spherical_yn 1
scipy.special.gammaincc 1
scipy.stats.crystalball 1
scipy.misc.factorial2 1
scipy.special.factorial2 1
scipy.signal.sosfilt_zi 1
scipy.linalg.companion 1
scipy.signal.normalize 1
scipy.special.spherical_in 1
scipy.special.spherical_kn 1
scipy.special.ndtri 1
scipy.special.roots_laguerre 1
scipy.linalg.diagsvd 1
scipy.constants.lambda2nu 1
scipy.constants.speed_of_light 1
scipy.stats.cauchy 1
scipy.ndimage.mean 1
scipy.special.basic 1
scipy.sparse.sparsetools 1
scipy.io.FortranFile 1
scipy.constants.convert_temperature 1
scipy.linalg.expm_cond 1
scipy.interpolate.SmoothSphereBivariateSpline 1
scipy.optimize.rosen_hess 1
scipy.special.eval_gegenbauer 1
scipy.ndimage.gaussian_laplace 1
scipy.integrate.root_scalar 1
scipy.integrate.minimize_scalar 1
scipy.signal.tf2sos 1
scipy.signal.zpk2sos 1
scipy.signal.firwin2 1
scipy.signal.oaconvolve 1
scipy.signal.max_len_seq 1
scipy.stats.sigmaclip 1
scipy.special.j1 1
scipy.windows.boxcar 1
scipy.windows.parzen 1
scipy.stats.frechet_l 1
scipy.stats.frechet_r 1
scipy.stats.reciprocal 1
scipy.misc.factorial 1
scipy.stats.genextreme 1
scipy.stats.dirichlet 1
scipy.stats.ttest_ind_from_stats 1
scipy.special.entr 1
scipy.stats.normaltest 1
scipy.integrate.vode 1
scipy.integrate.lsoda 1
scipy.linalg.matmul_toeplitz 1
scipy.linalg.decomp_schur 1
scipy.linalg.get_blas_funcs 1
scipy.integrate.romb 1
scipy.linalg.qr_multiply 1
scipy.linalg.tanm 1
scipy.linalg.signm 1
scipy.stats.wishart 1
scipy.stats.invwishart 1
scipy.sparse.dia_array 1
scipy.linalg.qz 1
scipy.linalg.cholesky_banded 1
scipy.optimize.ridder 1
scipy.optimize.brent 1
scipy.optimize.OptimizeWarning 1
scipy.sparse.construct 1
scipy.stats.anderson_ksamp 1
scipy.optimize.brute 1
scipy.optimize.Constraint 1
scipy.optimize.SR1 1
scipy.signal.ShortTimeFFT 1
scipy.ndimage.maximum_filter1d 1
scipy.signal.firls 1
scipy.signal.baxcar 1
scipy.signal.tri 1
scipy.signal.sigtools 1
scipy.signal.dlti 1
scipy.signal.csd 1
scipy.signal.gauss_spline 1
scipy.signal.qspline1d 1
scipy.signal.cspline1d 1
scipy.signal.qspline1d_eval 1
scipy.signal.cspline1d_eval 1
scipy.signal.qspline2d 1
scipy.signal.cspline2d 1
scipy.LowLevelCallable 1
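The two tables above correspond to the scipy_module_usage.csv and scipy_function_usage.csv files written by the script below. A minimal sketch of how one might reload and summarize them with pandas (paths follow the script's default output directory; the cut-offs are arbitrary):

import pandas as pd

# File and column names match those written by save_results() in the script below.
modules = pd.read_csv("scipy_usage_results/scipy_module_usage.csv")
functions = pd.read_csv("scipy_usage_results/scipy_function_usage.csv")

# Share of each submodule in all observed usages.
modules["share"] = modules["usage_count"] / modules["usage_count"].sum()
print(modules.head(10))

# Top 5 functions per submodule (the submodule is the first two dotted parts).
functions["module"] = functions["function"].str.extract(r"^(scipy\.\w+)", expand=False)
top_per_module = (
    functions.sort_values("usage_count", ascending=False).groupby("module").head(5)
)
print(top_per_module.head(20))
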
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "pandas",
# "requests",
# ]
# ///
"""
GitHub SciPy Usage Analysis Script using GraphQL API
This script uses GitHub's GraphQL API to analyze scipy usage patterns from GitHub repositories.
It searches for specific import patterns and function usage across repositories.
Setup:
1. Create a GitHub Personal Access Token with 'public_repo' scope
2. Set GITHUB_TOKEN environment variable or update the script
3. Run with `uv run --env-file <ENV_FILE> <THIS_SCRIPT>`
Disclaimer: This was almost completely vibe-coded.
"""
# ruff: noqa: T201
import base64
import json
import logging
import os
import re
import time
from collections import Counter
from datetime import datetime
from typing import Any
import pandas as pd
import requests
# Configure logging
logging.basicConfig(
level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
class GitHubSciPyAnalyzer:
results: dict[str, Any]
rate_limit_reset: int | None
def __init__(self, github_token: str) -> None:
"""
Initialize the GitHub SciPy usage analyzer.
Args:
github_token: GitHub Personal Access Token
"""
self.github_token = github_token
self.api_url = "https://api.github.com/graphql"
self.headers = {
"Authorization": f"Bearer {github_token}",
"Content-Type": "application/json",
}
# Rate limiting
self.requests_made = 0
self.rate_limit_remaining = 5000
self.rate_limit_reset = None
# Define scipy modules and search patterns
self.scipy_modules = [
"stats",
"linalg",
"optimize",
"integrate",
"interpolate",
"signal",
"sparse",
"spatial",
"special",
"ndimage",
"io",
"fft",
"cluster",
"constants",
"misc",
"odr",
]
# Results storage
self.results = {
"module_usage": Counter(),
"function_usage": Counter(),
"import_patterns": Counter(),
"repositories": set(),
"files_analyzed": 0,
}
def wait_for_rate_limit(self) -> None:
"""Handle rate limiting by waiting if necessary."""
logger.debug(
"Rate limit check: remaining=%d, reset=%s",
self.rate_limit_remaining,
self.rate_limit_reset,
)
# Check if we're actually close to the rate limit
if self.rate_limit_remaining <= 10:
if self.rate_limit_reset:
current_time = time.time()
wait_time = max(0, self.rate_limit_reset - current_time) + 1
logger.info(
"Rate limit low (remaining: %d), waiting %d seconds...",
self.rate_limit_remaining,
round(wait_time, 2),
)
time.sleep(wait_time)
# Reset rate limit info after waiting
self.rate_limit_remaining = 5000
else:
# If no reset time, wait a default amount
logger.info("Rate limit low but no reset time, waiting 60 seconds...")
time.sleep(60)
self.rate_limit_remaining = 5000
def make_graphql_request(
self, query: str, variables: dict[str, Any] | None = None
) -> dict[str, Any] | None:
"""
Make a GraphQL request to GitHub API.
Args:
query: GraphQL query string
variables: Query variables
Returns:
Response data
"""
self.wait_for_rate_limit()
payload: dict[str, Any] = {"query": query}
if variables:
payload["variables"] = variables
response = requests.post(self.api_url, json=payload, headers=self.headers)
self.requests_made += 1
# Update rate limit info
self.rate_limit_remaining = int(
response.headers.get("X-RateLimit-Remaining", 0)
)
reset_time = response.headers.get("X-RateLimit-Reset")
if reset_time:
self.rate_limit_reset = int(reset_time)
if response.status_code != 200:
logger.error("GraphQL request failed: %d", response.status_code)
logger.error("%s", response.text)
return None
data: dict[str, Any] = response.json()
if "errors" in data:
logger.error("GraphQL errors: %s", data["errors"])
return None
return data
def search_code_files(
self, search_query: str, max_results: int = 100
) -> list[dict[str, Any]]:
"""
Search for code files using GitHub's REST API.
Args:
search_query: Search query string
max_results: Maximum number of results to return
Returns:
List of file information with content
"""
results: list[dict[str, Any]] = []
page = 1
per_page = min(100, max_results)
while len(results) < max_results:
# Use REST API for code search
url = "https://api.github.com/search/code"
headers = {
"Authorization": f"Bearer {self.github_token}",
"Accept": "application/vnd.github.v3+json",
}
params = {"q": search_query, "page": page, "per_page": per_page}
self.wait_for_rate_limit()
response = requests.get(url, headers=headers, params=params)
self.requests_made += 1
# Update rate limit info
self.rate_limit_remaining = int(
response.headers.get("X-RateLimit-Remaining", 5000)
)
reset_time = response.headers.get("X-RateLimit-Reset")
if reset_time:
self.rate_limit_reset = int(reset_time)
logger.debug(
"After request: remaining=%d, reset=%s, status=%d",
self.rate_limit_remaining,
self.rate_limit_reset,
response.status_code,
)
if response.status_code != 200:
logger.error("REST API request failed: %d", response.status_code)
logger.error("%s", response.text)
if response.status_code == 403:
logger.error("Rate limit exceeded or forbidden")
# Wait longer for rate limit reset
if self.rate_limit_reset:
wait_time = max(0, self.rate_limit_reset - time.time()) + 1
logger.info(
"Waiting %d seconds for rate limit reset...",
round(wait_time, 2),
)
time.sleep(wait_time)
self.rate_limit_remaining = 5000
continue
break
data = response.json()
for item in data.get("items", []):
if len(results) >= max_results:
break
# Get file content using raw URL
content_url = item.get("git_url", "")
if content_url:
content = self.get_file_content(content_url)
if content:
results.append({
"repo_name": item["repository"]["full_name"],
"content": content,
"fragment": item.get("text_matches", [{}])[0].get(
"fragment", ""
),
})
# Check if there are more pages
if len(data.get("items", [])) < per_page:
break
page += 1
return results[:max_results]
def get_file_content(self, git_url: str) -> str | None:
"""
Get file content from GitHub's git API.
Args:
git_url: Git URL for the file
Returns:
File content or None if failed
"""
self.wait_for_rate_limit()
headers = {
"Authorization": f"Bearer {self.github_token}",
"Accept": "application/vnd.github.v3+json",
}
response = requests.get(git_url, headers=headers)
self.requests_made += 1
# Update rate limit info
self.rate_limit_remaining = int(
response.headers.get("X-RateLimit-Remaining", 5000)
)
reset_time = response.headers.get("X-RateLimit-Reset")
if reset_time:
self.rate_limit_reset = int(reset_time)
if response.status_code != 200:
return None
data = response.json()
# Content is base64 encoded
try:
content = base64.b64decode(data["content"]).decode("utf-8")
return content
except Exception:
return None
def search_scipy_patterns(self, max_files_per_pattern: int = 200) -> dict[str, Any]:
"""
Search for various scipy usage patterns.
Args:
max_files_per_pattern: Maximum files to analyze per pattern
Returns:
Dictionary of search results by pattern
"""
search_patterns = [
# Import patterns
'language:python "from scipy.stats import"',
'language:python "from scipy.linalg import"',
'language:python "from scipy.optimize import"',
'language:python "from scipy.integrate import"',
'language:python "from scipy.interpolate import"',
'language:python "from scipy.signal import"',
'language:python "from scipy.sparse import"',
'language:python "from scipy.spatial import"',
'language:python "from scipy.special import"',
'language:python "from scipy.ndimage import"',
# Module imports
'language:python "import scipy.stats"',
'language:python "import scipy.linalg"',
'language:python "import scipy.optimize"',
'language:python "import scipy.signal"',
# Function calls
'language:python "scipy.stats."',
'language:python "scipy.linalg."',
'language:python "scipy.optimize."',
'language:python "scipy.signal."',
]
all_results: dict[str, Any] = {}
for i, pattern in enumerate(search_patterns):
logger.info(
"Searching pattern %d/%d: %s", i + 1, len(search_patterns), pattern
)
try:
results = self.search_code_files(pattern, max_files_per_pattern)
all_results[pattern] = results
logger.info("Found %d files for pattern: %s", len(results), pattern)
# Small delay to be respectful to the API
time.sleep(1)
except Exception as e:
logger.exception("Error searching pattern '%s': %s", pattern, e)
continue
return all_results
def analyze_file_content(self, content: str, repo_name: str) -> None:
"""
Analyze a single file's content for scipy usage.
Args:
content: File content
repo_name: Repository name
"""
if not content:
return
self.results["files_analyzed"] += 1
self.results["repositories"].add(repo_name)
# Patterns to match
patterns = {
"from_imports": r"from\s+scipy\.(\w+)\s+import\s+([^#\n]+)",
"module_imports": r"import\s+scipy\.(\w+)(?:\s+as\s+\w+)?",
"function_calls": r"scipy\.(\w+)\.(\w+)",
"direct_scipy_import": r"from\s+scipy\s+import\s+([^#\n]+)",
}
# Analyze from imports
for match in re.finditer(patterns["from_imports"], content):
module = match.group(1)
imports = match.group(2)
self.results["module_usage"][f"scipy.{module}"] += 1
self.results["import_patterns"][f"from scipy.{module} import"] += 1
# Parse individual functions
for func in imports.split(","):
func = func.strip().split(" as ")[0].strip()
if func and func != "*" and func.isidentifier():
self.results["function_usage"][f"scipy.{module}.{func}"] += 1
# Analyze module imports
for match in re.finditer(patterns["module_imports"], content):
module = match.group(1)
self.results["module_usage"][f"scipy.{module}"] += 1
self.results["import_patterns"][f"import scipy.{module}"] += 1
# Analyze function calls
for match in re.finditer(patterns["function_calls"], content):
module = match.group(1)
function = match.group(2)
self.results["module_usage"][f"scipy.{module}"] += 1
self.results["function_usage"][f"scipy.{module}.{function}"] += 1
# Analyze direct scipy imports
for match in re.finditer(patterns["direct_scipy_import"], content):
imports = match.group(1)
self.results["import_patterns"]["from scipy import"] += 1
for module in imports.split(","):
module = module.strip().split(" as ")[0].strip()
if module and module in self.scipy_modules:
self.results["module_usage"][f"scipy.{module}"] += 1
def run_analysis(
self, max_files_per_pattern: int = 200, output_dir: str = "scipy_usage_results"
) -> None:
"""
Run the complete analysis pipeline.
Args:
max_files_per_pattern: Maximum files to analyze per search pattern
output_dir: Directory to save results
"""
logger.info("Starting SciPy usage analysis with GitHub GraphQL API...")
# Search for scipy patterns
search_results = self.search_scipy_patterns(max_files_per_pattern)
# Analyze all found files
total_files = sum(len(files) for files in search_results.values())
logger.info("Analyzing %d files...", total_files)
analyzed_files: set[str] = set() # To avoid duplicates
for pattern, files in search_results.items():
logger.info("Processing %d files from pattern: %s", len(files), pattern)
for file_info in files:
file_key = f"{file_info['repo_name']}-{hash(file_info['content'])}"
if file_key not in analyzed_files:
self.analyze_file_content(
file_info["content"], file_info["repo_name"]
)
analyzed_files.add(file_key)
# Save results
self.save_results(output_dir)
self.print_summary()
logger.info("Analysis complete!")
def save_results(self, output_dir: str = "scipy_usage_results") -> None:
"""
Save analysis results to files.
Args:
output_dir: Directory to save results
"""
os.makedirs(output_dir, exist_ok=True)
# Save module usage
module_df = pd.DataFrame(
[
(module, count)
for module, count in self.results["module_usage"].most_common()
],
columns=["module", "usage_count"],
)
module_df.to_csv(f"{output_dir}/scipy_module_usage.csv", index=False)
# Save function usage
function_df = pd.DataFrame(
[
(func, count)
for func, count in self.results["function_usage"].most_common()
],
columns=["function", "usage_count"],
)
function_df.to_csv(f"{output_dir}/scipy_function_usage.csv", index=False)
# Save import patterns
import_df = pd.DataFrame(
[
(pattern, count)
for pattern, count in self.results["import_patterns"].most_common()
],
columns=["import_pattern", "usage_count"],
)
import_df.to_csv(f"{output_dir}/scipy_import_patterns.csv", index=False)
# Save summary statistics
summary = {
"analysis_date": datetime.now().isoformat(),
"total_repositories": len(self.results["repositories"]),
"total_files_analyzed": self.results["files_analyzed"],
"unique_modules": len(self.results["module_usage"]),
"unique_functions": len(self.results["function_usage"]),
"api_requests_made": self.requests_made,
"top_10_modules": dict(self.results["module_usage"].most_common(10)),
"top_20_functions": dict(self.results["function_usage"].most_common(20)),
"top_import_patterns": dict(
self.results["import_patterns"].most_common(10)
),
}
with open(f"{output_dir}/scipy_usage_summary.json", "w") as f:
json.dump(summary, f, indent=2)
logger.info("Results saved to %s/", output_dir)
def print_summary(self) -> None:
"""Print a summary of the analysis results."""
print("\n=== SciPy Usage Analysis Summary ===")
print(f"Total repositories analyzed: {len(self.results['repositories']):,}")
print(f"Total Python files analyzed: {self.results['files_analyzed']:,}")
print(f"Unique scipy modules found: {len(self.results['module_usage'])}")
print(f"Unique scipy functions found: {len(self.results['function_usage'])}")
print(f"API requests made: {self.requests_made}")
print("\n=== Top 10 Most Used SciPy Modules ===")
for module, count in self.results["module_usage"].most_common(10):
print(f"{module}: {count:,} occurrences")
print("\n=== Top 20 Most Used SciPy Functions ===")
for function, count in self.results["function_usage"].most_common(20):
print(f"{function}: {count:,} occurrences")
print("\n=== Most Common Import Patterns ===")
for pattern, count in self.results["import_patterns"].most_common(10):
print(f"{pattern}: {count:,} occurrences")
def main() -> None:
"""Main function to run the analysis."""
# Configuration
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") # set your GitHub token
MAX_FILES_PER_PATTERN = 200 # Adjust based on your needs
OUTPUT_DIR = "scipy_usage_results"
if not GITHUB_TOKEN:
print("Error: GITHUB_TOKEN environment variable not set")
print(
"Please create a GitHub Personal Access Token and set it as an environment variable"
)
return
# Initialize analyzer
analyzer = GitHubSciPyAnalyzer(GITHUB_TOKEN)
# Run analysis
analyzer.run_analysis(MAX_FILES_PER_PATTERN, OUTPUT_DIR)
if __name__ == "__main__":
main()
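The counting logic can be sanity-checked offline, without touching the GitHub API, by feeding analyze_file_content a small synthetic snippet. A minimal sketch, assuming the script above is saved as scipy_usage_survey.py (a hypothetical file name) so the class can be imported:

from scipy_usage_survey import GitHubSciPyAnalyzer  # hypothetical module name

analyzer = GitHubSciPyAnalyzer("dummy-token")  # the token is stored but never sent here
analyzer.analyze_file_content(
    "from scipy.stats import norm\n"
    "import scipy.signal\n"
    "res = scipy.optimize.minimize(f, x0)\n",
    "example/repo",
)
print(analyzer.results["module_usage"])    # scipy.stats, scipy.signal, scipy.optimize: 1 each
print(analyzer.results["function_usage"])  # scipy.stats.norm and scipy.optimize.minimize: 1 each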