Created
November 30, 2023 22:01
-
-
Save skeptrunedev/47bc69cd06359ec40aca4701932ded5f to your computer and use it in GitHub Desktop.
python script to dedup a database by column
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import psycopg2 | |
from dotenv import load_dotenv | |
import os | |
import json | |
import requests | |
# Load the .env file | |
load_dotenv() | |
origin_db_url = os.environ.get("ORIGIN_DB_URL") | |
conn = psycopg2.connect(origin_db_url) | |
cur = conn.cursor() | |
conn2 = psycopg2.connect(origin_db_url) | |
cur2 = conn2.cursor() | |
cur.execute( | |
"""SELECT id | |
FROM card_metadata | |
WHERE qdrant_point_id IS NULL | |
AND metadata NOT IN ( | |
SELECT metadata | |
FROM card_metadata | |
GROUP BY metadata | |
HAVING COUNT(*) = 1 | |
);""" | |
) | |
while True: | |
# Fetch 20 rows | |
rows = cur.fetchmany(10000) | |
# If no more rows are available, break the loop | |
if not rows: | |
break | |
ids = [row[0] for row in rows] | |
# delete all the rows from the card_collisions table which have one of the ids as their value for card_metadata_id | |
# print the number of rows deleted | |
cur2.execute("""DELETE FROM card_collisions WHERE card_id IN %s""", (tuple(ids),)) | |
num_rows_deleted = cur2.rowcount | |
print(num_rows_deleted) | |
# delete all the rows from the card_metadata table which have one of the ids as their value for id | |
# print the number of rows deleted | |
cur2.execute("""DELETE FROM card_metadata WHERE id IN %s""", (tuple(ids),)) | |
num_rows_deleted = cur2.rowcount | |
print(num_rows_deleted) | |
# commit the changes | |
conn2.commit() | |
# Close the cursor and connection | |
cur.close() | |
conn.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment