get pastes
# Script for crawling http://psbdmp.com/ and storing password-dump pastes in MongoDB.
import argparse
import re

import requests
from bs4 import BeautifulSoup
from dateutil import parser
from pymongo import MongoClient

url = 'http://psbdmp.com/'
client = MongoClient('localhost', 27017)
db = client['psbdmp']

# MongoDB collection handles
con = db['pastes']
emails_con = db['emails']
ips_con = db['ips']
info_con = db['info']

# Regex for emails (raw string so the backslash escapes reach the regex engine intact)
email_regex = re.compile(r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})', re.IGNORECASE)
# Regex for IPv4 addresses
ip_regex = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
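# Illustrative behaviour (example inputs are not from the original gist):
# email_regex.findall("contact admin@example.com") -> ['admin@example.com'].
# Note that ip_regex checks digit counts only, so out-of-range octets such as
# 999.999.999.999 also match.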
def find_ips_in_html(html):
    if html is None:
        return set()
    return set(ip_regex.findall(html))


def find_emails_in_html(html):
    if html is None:
        return set()
    return set(email_regex.findall(html))


def get_last_paste_time():
    # The newest paste is the first unclassed <tr> on page 1 of the dump listing.
    res = requests.get(url + 'dumps/1')
    soup = BeautifulSoup(res.content, "html.parser")
    tr = soup.find('tr', {'class': None})
    time = tr.findAll('td')[2].getText()
    return parser.parse(time)
def update():
    # Incremental update is not implemented in the gist; this only fetches the
    # timestamp of the newest paste.
    try:
        last_paste_time = get_last_paste_time()
        print("updating")
    except Exception:
        print("error in getting time")
def populate():
    res = requests.get(url + 'dumps/')
    soup = BeautifulSoup(res.content, "html.parser")
    lis = soup.findAll('li', {'class': 'next page'})
    last_page_num = int(lis[1].find_all('a')[0]['data-ci-pagination-page'])
    # Loop over every listing page (+1 so the last page is not skipped)
    for n in range(1, last_page_num + 1):
        res = requests.get(url + 'dumps/' + str(n))
        soup = BeautifulSoup(res.content, "html.parser")
        trs = soup.findAll('tr', {'class': None})
        for tr in trs:
            tds = tr.findAll('td')
            link = tds[0].getText()  # paste id; the original passed 'href' to
                                     # getText(), where it only acts as a separator
            title = tds[1].getText()
            time = tds[2].getText()
            raw = requests.get(url + 'api/dump/get/' + link)
            post = {
                "link": link,
                "time": time,
                "title": title,
                "raw": raw.text,
                "source": "pastebin",
            }
            con.insert_one(post)
            # Collect e-mail addresses found in the dump (raw.text, not
            # raw.content: the regexes are str patterns and cannot scan bytes)
            for email in find_emails_in_html(raw.text):
                emails_con.insert_one({
                    "link": link,
                    "time": time,
                    "title": title,
                    "source": "pastebin",
                    "email": email,
                    "domain": email.split("@")[1],
                })
            # Collect IPv4 addresses found in the dump
            for ip in find_ips_in_html(raw.text):
                ips_con.insert_one({
                    "link": link,
                    "time": time,
                    "title": title,
                    "source": "pastebin",
                    "ip": ip,
                })
if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        description='Tool for populating or updating the pastebin dumps database')
    argparser.add_argument('--mode', required=True, choices=['u', 'p'],
                           help='Choose a mode: u(pdate) or p(opulate). Update assumes '
                                'the database has already been populated and the info '
                                'collection exists.')
    args = argparser.parse_args()
    if args.mode == "u":
        update()
    elif args.mode == "p":
        populate()
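A minimal way to run the script, assuming it is saved as psbdmp_crawler.py (the gist does not give a filename):

python psbdmp_crawler.py --mode p   # first run: crawl all listing pages and populate the collections
python psbdmp_crawler.py --mode u   # later runs: the update path, currently a stub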