-
-
Save serif/a1281c676cf5a1f77af6ff1a25255a85 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# updated 2023-11-27 | |
# updated 2023-10-12 | |
# updated 2021 | |
# updated 2020 | |
# created 2018 | |
import sys | |
import hashlib | |
from urllib.parse import urlparse | |
def main(argv):
    """Deduplicate a Bitwarden CSV export.

    argv[0] is the path to the exported .csv file.  Unique entries are
    written to <name>_out.csv; duplicate or unparseable lines go to
    <name>_rem.csv.  Two entries are considered duplicates when the MD5
    of (login_uri domain + username + password) matches.
    """
    # Fields in Bitwarden CSV (order matters: used for index lookups below)
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')
    if len(argv) < 1:
        sys.exit('Supply input file path as command argument')
    in_path = argv[0]
    csv = '.csv'
    out_path = in_path.replace(csv, '_out' + csv)
    rem_path = in_path.replace(csv, '_rem' + csv)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''
    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
            open(rem_path, 'w', encoding='utf8') as rem_file, \
            open(in_path, 'r', encoding='utf8') as in_file:
        for line in in_file:
            line_number += 1
            # Validate .csv format against the expected Bitwarden header
            if line_number == 0:
                if line.strip() != ','.join(f):
                    print('\nBitwarden CSV format has changed.')
                    print('Contact author for update.')
                    sys.exit(1)
                # Keep the header row in the deduplicated output file.
                out_file.write(line)
                continue
            # Skip empty lines
            if not line.strip():
                continue
            fields = line.split(',')
            # If the line has fewer fields than expected, try to combine
            # it with the previously cached partial line (multi-line record).
            if len(fields) < len(f):
                line = cache.strip('\n') + line
                cache = line
                fields = line.split(',')
                if len(fields) == len(f):
                    print(f'Recovered with line {line_number}:\n{line}')
                    cache = ''
                else:
                    print(f'Missing fields in line {line_number}:\n{line}')
                    rem_file.write(line)
                    continue
            else:
                cache = ''
            # Generate an MD5 hash from login URI domain, username and password.
            # Only the netloc is used so differing paths on one site dedupe.
            domain = urlparse(fields[f.index('login_uri')]).netloc
            if len(domain) > 0:
                fields[f.index('login_uri')] = domain
            token = fields[f.index('login_uri')]
            token += fields[f.index('login_username')]
            token += fields[f.index('login_password')]
            hash_value = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
            # Write entry: first occurrence goes to _out, repeats to _rem
            if hash_value not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hash_value)
                write_count += 1
            else:
                rem_file.write(line)
    # Report (line_number is the index of the last line; the header is
    # excluded from write_count, so the subtraction balances out)
    dup_count = line_number - write_count
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')


if __name__ == "__main__":
    main(sys.argv[1:])
I have managed to import a file with multi-line entries using some modifications (full code below). It also works for me when there are curly braces inside password fields. The only issue is multiple deleted "lines" are created in the _rem file for each iteration going through the multi-line portion.
#!/usr/bin/env python3
# updated 2024-02-07
# updated 2023-11-27
# updated 2023-10-12
# updated 2021
# updated 2020
# created 2018
import hashlib
import sys
from urllib.parse import urlparse
# With a little help from...
# https://stackoverflow.com/questions/29375614/how-to-get-csv-reader-to-ignore-commas-within-braces-curly-square-angle
# Characters that open/close a protected region: commas inside these
# (or inside double quotes) are not treated as field separators.
l_braces = {"{"}
r_braces = {"}"}


def split(s):
    """Split *s* on commas that are not inside double quotes or braces.

    A double quote toggles a protected region; each character in
    ``l_braces``/``r_braces`` opens/closes one nesting level.  Returns
    the comma-separated pieces, each stripped of surrounding whitespace.
    """
    depth = 0           # > 0 while inside quotes and/or braces
    in_quotes = False
    breaks = []         # indices of top-level (unprotected) commas
    for i, c in enumerate(s):
        if c == '"':
            in_quotes = not in_quotes
            depth += 1 if in_quotes else -1
        elif c in l_braces:
            depth += 1
        elif c in r_braces:
            depth -= 1
        elif c == "," and depth == 0:
            breaks.append(i)
    # Cut the string at the recorded break positions.  When there are no
    # top-level commas the whole string is a single piece -- the previous
    # IndexError fallback re-split on *every* comma, which destroyed
    # fields whose commas were protected by quotes or braces.
    pieces = []
    start = 0
    for b in breaks:
        pieces.append(s[start:b].strip())
        start = b + 1
    pieces.append(s[start:].strip())
    return pieces
def main(argv):
    """Deduplicate a Bitwarden CSV export (quote/brace-aware splitting).

    argv[0] is the path to the exported .csv file.  Unique entries are
    written to <name>_out.csv; duplicates and unrecoverable lines go to
    <name>_rem.csv.  Lines with too few fields are merged with following
    lines until a full record is assembled (multi-line notes/fields).
    Duplicates are detected by an MD5 over login_uri domain + username +
    password, falling back to the notes field when all three are empty
    (e.g. secure notes).
    """
    # Fields in Bitwarden CSV (order matters: used for index lookups below)
    f = "folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp".split(",")
    if len(argv) < 1:
        sys.exit("Supply input file path as command argument")
    in_path = argv[0]
    csv = ".csv"
    out_path = in_path.replace(csv, "_out" + csv)
    rem_path = in_path.replace(csv, "_rem" + csv)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ""
    # Process file
    with open(out_path, "w", encoding="utf8") as out_file, open(
        rem_path, "w", encoding="utf8"
    ) as rem_file, open(in_path, "r", encoding="utf8") as in_file:
        for line in in_file:
            line_number += 1
            # Validate .csv format against the expected Bitwarden header
            if line_number == 0:
                if line.strip() != ",".join(f):
                    print("\nBitwarden CSV format has changed.")
                    print("Contact author for update.")
                    sys.exit(1)
                # Keep the header row in the deduplicated output file.
                out_file.write(line)
                continue
            # Skip empty lines
            if not line.strip():
                continue
            fields = split(line)
            # If the line has fewer fields than expected, try to combine
            # it with the previously cached partial record.
            if len(fields) < len(f):
                line = cache + line
                cache = line
                fields = split(line)
                if len(fields) == len(f):
                    print(f"Recovered with line {line_number}:\n{line}")
                    cache = ""
                else:
                    print(f"Missing fields in line {line_number}:\n{line}")
                    # Still incomplete: keep accumulating.  Do NOT write the
                    # partial record to _rem here -- doing so emitted one
                    # spurious _rem line per continuation line of every
                    # multi-line entry.
                    continue
            else:
                cache = ""
            # Generate an MD5 hash from login URI domain, username and password.
            # Only the netloc is used so differing paths on one site dedupe.
            domain = urlparse(fields[f.index("login_uri")]).netloc
            if len(domain) > 0:
                fields[f.index("login_uri")] = domain
            token = fields[f.index("login_uri")]
            token += fields[f.index("login_username")]
            token += fields[f.index("login_password")]
            if token == "":
                # No login data at all (e.g. a secure note): hash the notes.
                token = fields[f.index("notes")]
            hash_value = hashlib.md5(token.rstrip().encode("utf-8")).hexdigest()
            # Write entry: first occurrence goes to _out, repeats to _rem
            if hash_value not in completed_lines_hash:
                out_file.write(line)
                completed_lines_hash.add(hash_value)
                write_count += 1
            else:
                rem_file.write(line)
        if cache:
            # A record never reached the full field count: preserve it once.
            print(f"Unrecovered record at end of file:\n{cache}")
            rem_file.write(cache)
    # Report.  NOTE(review): line_number counts physical lines, so merged
    # multi-line records inflate this figure relative to logical entries.
    dup_count = line_number - write_count
    print(f"\nOutput file: {out_path}\n{write_count} unique entries saved")
    print(f"\n{dup_count} duplicates saved to {rem_path}")


if __name__ == "__main__":
    main(sys.argv[1:])
To anyone new looking for a script that, in addition to removing duplicates, also helps you find and get rid of old copies of passwords for websites (where you have the same username but both old and new passwords): this script that I wrote can help.
This uses the bitwarden json export, preserving more data, including properly preserving notes. It also keeps the newest entry if multiple exist. Consider merging this with @howird's logic above for the best combination
https://gist.github.com/topisani/066b63b87346afe76ffdf0998d4ebc2f
Thank you for providing this code. Unfortunately, it fails with multiline entries created by latest bitwarden on Windows 10.