Created
September 9, 2021 01:56
-
-
Save kakarukeys/51551cd1ad38bb77b0a849d929b7844c to your computer and use it in GitHub Desktop.
summary email harvesting
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from pymongo import MongoClient
from settings import DEST_DATABASE_URL
# Email-like token: the local part is restricted to word characters plus
# ". + -" so punctuation glued to the address ("(", "email:", ...) is not
# captured, and the domain accepts any number of dot-separated, possibly
# hyphenated labels (e.g. "mail.example.co.uk", "my-company.com").
# Compiled once here instead of on every record.
EMAIL_PATTERN = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")

# Print the running counters every this many records.
PROGRESS_INTERVAL = 1000


def extract_emails(text):
    """Return all email-like substrings of *text*, in order of appearance.

    *text* is converted with ``str`` first, so non-string summary values
    are accepted. Returns an empty list when nothing matches.
    """
    return EMAIL_PATTERN.findall(str(text))


def main():
    """Scan the ``summary`` field of every contact record for emails.

    Reads the database credentials from the ``CREDENTIALS`` file,
    substitutes them for the ``$(cat CREDENTIALS)`` placeholder in
    ``DEST_DATABASE_URL``, and iterates ``linkedin_data.contact_raw`` in
    reverse natural order. Each non-empty list of emails found is printed,
    and the running counters are printed every ``PROGRESS_INTERVAL``
    records and once more when the scan finishes.
    """
    with open("CREDENTIALS", encoding="utf-8") as f:
        credentials = f.read().strip()

    url = DEST_DATABASE_URL.replace("$(cat CREDENTIALS)", credentials)
    counts = {"total": 0, "with_email": 0}

    # The context manager closes the client's connections even if the scan
    # is interrupted part-way through.
    with MongoClient(url) as client:
        coll = client.linkedin_data.contact_raw

        # Project only "summary" so the other fields are not transferred.
        for rec in coll.find({}, {"summary": 1}).sort("$natural", -1):
            counts["total"] += 1

            summary = rec.get("summary")
            if summary:
                emails = extract_emails(summary)
                if emails:
                    print(emails)
                    counts["with_email"] += 1

            if counts["total"] % PROGRESS_INTERVAL == 0:
                print(counts)

    # Always report the final totals; without this the records after the
    # last multiple of PROGRESS_INTERVAL would never be reflected in output.
    print(counts)


if __name__ == "__main__":
    main()

# Figures noted from an earlier run (made with the original, simpler pattern
# r'\S+@\w+\.\w+'):
# {'total': 6400000, 'with_email': 43571}
# 11,000,000 records, .006807969, 75k emails
# (.006807969 is 43571 / 6400000; the 75k figure is presumably that ratio
# extrapolated to 11,000,000 records.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment