Last active
September 23, 2024 21:52
-
-
Save gordthompson/3e013c5023b9791b40fde9fedd032393 to your computer and use it in GitHub Desktop.
search DataFrame for null values that could trigger a row scan with fast_executemany
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A DataFrame that contains a lot of "null" values (NaN, etc.) can cause a | |
significant performance penalty with fast_executemany=True. For details, see | |
https://github.com/mkleehammer/pyodbc/issues/741 | |
This routine scans the specified DataFrame to see how many "row scans" would be | |
required when using .to_sql() with fast_executemany=True. The first row must | |
always be scanned, but after that a re-scan will flush the current rows to the | |
server, increasing network traffic and slowing things down. | |
version 1.3 - 2023-10-21 | |
""" | |
from collections import OrderedDict | |
import pandas as pd | |
def test_df_for_rescans(df): | |
# we haven't scanned yet, so flag all columns as null | |
is_null = {col_name: True for col_name in df.columns} | |
rescan_count = -1 | |
triggered_by = dict() | |
first_one = True | |
for row_idx in df.index: | |
scan_needed = False | |
for col_name in df.columns: | |
if pd.notna(df.at[row_idx, col_name]) and is_null[col_name]: | |
scan_needed = True | |
if not first_one: | |
n = triggered_by.get(col_name, 0) | |
triggered_by[col_name] = n + 1 | |
first_one = False | |
break | |
if scan_needed: | |
rescan_count += 1 | |
is_null = { | |
col_name: pd.isna(df.at[row_idx, col_name]) | |
for col_name in df.columns | |
} | |
if not any(is_null.values()): | |
# all False, so no need to scan subsequent rows | |
break | |
return { | |
"total rows": len(df), | |
"re-scans": rescan_count, | |
"triggered by": dict( | |
OrderedDict( | |
sorted(triggered_by.items(), key=lambda t: t[1], reverse=True) | |
) | |
), | |
} | |
if __name__ == "__main__": | |
# example data | |
my_df = pd.DataFrame( | |
[(1, None), (None, 2), (3, None), (4, 4), (None, 5)], | |
columns=["col_1", "col_2"], | |
) | |
results = test_df_for_rescans(my_df) | |
print(results) | |
""" | |
{'total rows': 5, 're-scans': 3, 'triggered by': {'col_2': 2, 'col_1': 1}} | |
""" | |
""" | |
Notes: | |
Scans are triggered if a column value is non-null when its previous state | |
was recorded as null. | |
The first row always triggers a scan, so that does not count as a re-scan. | |
In the above example, the second and third rows trigger re-scans because | |
the None values traded places, and the fourth row triggers a re-scan | |
because the col_2 value changed from null to non-null. However, the fifth | |
row does not trigger a re-scan because the previous row did not contain | |
any null values. | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment