get pastes
# Script for crawling http://psbdmp.com/ and storing password-dump pastes in MongoDB.
import argparse
import re

import requests
from bs4 import BeautifulSoup
from dateutil import parser
from pymongo import MongoClient

url = 'http://psbdmp.com/'
client = MongoClient('localhost', 27017)
db = client['psbdmp']

# MongoDB collection handles
con = db['pastes']
emails_con = db['emails']
ips_con = db['ips']
info_con = db['info']

# Regex for emails (raw string so the backslash escapes reach the regex engine intact)
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
# Regex for IPv4 addresses
ip_regex = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
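# Illustrative behaviour (example inputs are not from the original gist):
# email_regex.findall("contact admin@example.com") -> ['admin@example.com'].
# Note that ip_regex checks digit counts only, so out-of-range octets such as
# 999.999.999.999 also match.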
def find_ips_in_html(html):
    if html is None:
        return set()
    return set(ip_regex.findall(html))


def find_emails_in_html(html):
    if html is None:
        return set()
    return set(email_regex.findall(html))


def get_last_paste_time():
    # The newest paste is the first unclassed <tr> on page 1 of the dump listing.
    res = requests.get(url + 'dumps/1')
    soup = BeautifulSoup(res.content, "html.parser")
    tr = soup.find('tr', {'class': None})
    time = tr.findAll('td')[2].getText()
    return parser.parse(time)
def update():
    # Incremental update is not implemented in the gist; this only fetches the
    # timestamp of the newest paste.
    try:
        last_paste_time = get_last_paste_time()
        print("updating")
    except Exception:
        print("error in getting time")
def populate():
    res = requests.get(url + 'dumps/')
    soup = BeautifulSoup(res.content, "html.parser")
    lis = soup.findAll('li', {'class': 'next page'})
    last_page_num = int(lis[1].find_all('a')[0]['data-ci-pagination-page'])
    # Loop over every listing page (+1 so the last page is not skipped)
    for n in range(1, last_page_num + 1):
        res = requests.get(url + 'dumps/' + str(n))
        soup = BeautifulSoup(res.content, "html.parser")
        trs = soup.findAll('tr', {'class': None})
        for tr in trs:
            tds = tr.findAll('td')
            link = tds[0].getText()  # paste id; the original passed 'href' to
                                     # getText(), where it only acts as a separator
            title = tds[1].getText()
            time = tds[2].getText()
            raw = requests.get(url + 'api/dump/get/' + link)
            post = {
                "link": link,
                "time": time,
                "title": title,
                "raw": raw.text,
                "source": "pastebin",
            }
            con.insert_one(post)
            # Collect e-mail addresses found in the dump (raw.text, not
            # raw.content: the regexes are str patterns and cannot scan bytes)
            for email in find_emails_in_html(raw.text):
                emails_con.insert_one({
                    "link": link,
                    "time": time,
                    "title": title,
                    "source": "pastebin",
                    "email": email,
                    "domain": email.split("@")[1],
                })
            # Collect IPv4 addresses found in the dump
            for ip in find_ips_in_html(raw.text):
                ips_con.insert_one({
                    "link": link,
                    "time": time,
                    "title": title,
                    "source": "pastebin",
                    "ip": ip,
                })
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        description='Tool for populating or updating the pastebin dumps database')
    argparser.add_argument('--mode', required=True, choices=['u', 'p'],
                           help='Choose a mode: u(pdate) or p(opulate). Update assumes '
                                'the database has already been populated and the info '
                                'collection exists.')
    args = argparser.parse_args()
    if args.mode == "u":
        update()
    elif args.mode == "p":
        populate()
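A minimal way to run the script, assuming it is saved as psbdmp_crawler.py (the gist does not give a filename):

python psbdmp_crawler.py --mode p   # first run: crawl all listing pages and populate the collections
python psbdmp_crawler.py --mode u   # later runs: the update path, currently a stub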