Created
December 22, 2024 18:12
-
-
Save anandtripathi5/5d3a18084470d06d776f009f97140124 to your computer and use it in GitHub Desktop.
Analyze postgres database parallely to save time by given number of processes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os | |
import time | |
import psycopg2 | |
import multiprocessing | |
# Define command-line arguments | |
parser = argparse.ArgumentParser(description='Analyze all tables in a PostgreSQL database in parallel.') | |
parser.add_argument('--host', type=str, required=True, help='The hostname of the PostgreSQL server') | |
parser.add_argument('--port', type=int, default=5432, help='The port number of the PostgreSQL server (default: 5432)') | |
parser.add_argument('--user', type=str, required=True, | |
help='The username to use when connecting to the PostgreSQL server') | |
parser.add_argument('--password', type=str, required=True, | |
help='The password to use when connecting to the PostgreSQL server') | |
parser.add_argument('--database', type=str, required=True, help='The name of the PostgreSQL database') | |
parser.add_argument('--processes', type=int, | |
help='The number of parallel jobs to run') | |
parser.add_argument('--verbose', action='store_true', help='Output logs to stdout') | |
# Parse command-line arguments | |
args = parser.parse_args() | |
# Define function to analyze a table | |
def analyze_table(table): | |
conn = psycopg2.connect(host=args.host, port=args.port, user=args.user, password=args.password, | |
database=args.database) | |
cur = conn.cursor() | |
cur.execute(f'ANALYZE VERBOSE {table}') | |
conn.commit() | |
cur.close() | |
conn.close() | |
print(f'Analyzed table {table} by {os.getpid()}') | |
# Get list of tables in the database | |
conn = psycopg2.connect(host=args.host, port=args.port, user=args.user, password=args.password, database=args.database) | |
cur = conn.cursor() | |
cur.execute( | |
"SELECT schemaname, tablename FROM pg_tables WHERE schemaname != 'pg_catalog' AND schemaname != 'information_schema'") | |
tables = [f'{schema}.{table}' for schema, table in cur.fetchall()] | |
cur.close() | |
conn.close() | |
print(f'Found {len(tables)} tables in database') | |
if __name__ == '__main__': | |
print("Analyze tables in parallel") | |
start_time = time.time() | |
with multiprocessing.Pool(processes=args.processes) as pool: | |
pool.map(analyze_table, tables) | |
print(f"Analyze completed in {int((time.time() - start_time)/60)} minutes") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment