Skip to content

Instantly share code, notes, and snippets.

@Luxter77
Last active February 4, 2021 19:31
Show Gist options
  • Save Luxter77/3b5e45232a88f1b224610eb86b70e635 to your computer and use it in GitHub Desktop.
Save Luxter77/3b5e45232a88f1b224610eb86b70e635 to your computer and use it in GitHub Desktop.
Script that uses Tor to scrape definitions from the Urban Dictionary v0 API
#!/usr/bin/env python3
# coding: utf-8
#
# urban->json.py
#
# Copyright 2021 Luxter77 <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
from fake_useragent import UserAgent
from stem.control import Controller
from pickle import UnpicklingError
from requests import Session
from tqdm.auto import tqdm
from pprint import pformat
from stem import Signal
import requests
import sys, os
import pickle
import json
# Urban Dictionary "define" API endpoint; normally there is no reason to edit it.
endpoint_pages = "https://api.urbandictionary.com/v0/define"

# Definition ids to scrape: the script walks range(start_id, end_id), so
# end_id itself is excluded.  Per the original author, the defaults cover the
# ids that are missing from a 2016 dataset published on reddit.
start_id = 9_041_434
end_id = 15_684_329

# Directory that receives both the intermediate and the final output files;
# it is created on start-up if it does not exist yet.
urban_folder = os.path.join(os.pardir, "UrbanDictionary")
os.makedirs(urban_folder, exist_ok=True)
def d(outlist: list, outdir: str, naem: str = "latest") -> None:
    '''
    Helper function to save progress to file.

    Writes `outlist` three times inside `outdir`: as `<naem>.pkl`, as
    `<naem>.pkl.bkp` and as `<naem>.json`.

    outlist: list of scraped definition entries (must be picklable and
             JSON-serialisable).
    outdir:  directory that receives the files; it must already exist.
    naem:    base file name (without extension), "latest" by default.
    '''
    tqdm.write("Saving progress to disk!")
    # The two pickles are written one after the other, each through its own
    # context manager, so the first file is flushed and closed before the
    # second one is touched.  If the process dies mid-write, at most one of
    # the two copies is left truncated.
    with open(os.path.join(outdir, f"{naem}.pkl"), "wb") as fh:
        pickle.dump(outlist, fh)
    with open(os.path.join(outdir, f"{naem}.pkl.bkp"), "wb") as fh:
        pickle.dump(outlist, fh)
    with open(os.path.join(outdir, f"{naem}.json"), "w") as fh:
        json.dump(outlist, fh)
def n(c: Controller = None) -> Session:
    '''
    Helper function to reset ip and get a new session.

    c: optional Tor controller.  When given, a NEWNYM signal is sent before
       every attempt; when None, no signal is sent and only a fresh session
       is built.

    Returns a `requests` session routed through the local SOCKS proxy on
    port 9050, with a random User-Agent, that passed the check done by `p`.
    Keeps retrying until such a session is obtained.
    '''
    while True:
        # Test for the "no controller" case explicitly instead of swallowing
        # AttributeError, which would also hide genuine errors raised from
        # inside the controller.
        if c is not None:
            c.signal(Signal.NEWNYM)
        s = requests.Session()
        # NOTE(review): with the `socks5` scheme requests resolves host names
        # locally; `socks5h` would resolve them through the proxy -- confirm
        # which one is intended here.
        s.proxies = {"http": "socks5://localhost:9050", "https": "socks5://localhost:9050"}
        s.headers.update({'User-Agent': UserAgent().random})
        # p() returns True when the session must NOT be used.
        if not p(s):
            return s
        # Rejected session: release its connections before trying again.
        s.close()
def p(s: Session, timeout: float = 30.0) -> bool:
    '''
    Helper function to check the output node's reliability
    (some output nodes are evil and should not be used)

    s:       session to probe; it is used to query api.ipify.org for the
             current public ip.
    timeout: seconds to wait for the probe before giving up.

    Returns True when the session should NOT be trusted (the probe failed
    in any way) and False when the probe succeeded.
    '''
    try:
        # Without an explicit timeout requests waits forever, so a stalled
        # circuit would freeze the whole scraper; a timeout is reported as a
        # failed probe by the generic handler below.
        cip = str(s.get("https://api.ipify.org/?format=json", timeout=timeout).json()['ip'])
    except requests.exceptions.SSLError:
        # SSL failures are rejected silently.
        return True
    except Exception as e:
        tqdm.write(f'SOMETHING HAPPENED! [{e!r}]')
        return True
    # Do not echo the value if it looks like an HTML page rather than an ip.
    shown = f"[{cip}]" if "DOCTYPE" not in cip else ""
    tqdm.write(f'NEW IP! {shown}')
    return False
s = n()

# Store original starting point
ostart_id = start_id


def _load_pickle(path: str) -> list:
    '''Load one pickled progress file, closing the handle afterwards.'''
    # NOTE: pickle must only be used on files this script wrote itself.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Try to load last saved progress, else start from scratch
try:
    outlist = _load_pickle(os.path.join(urban_folder, 'latest.pkl'))
except (UnpicklingError, EOFError):
    # A truncated pickle raises UnpicklingError and a zero-byte one raises
    # EOFError; in both cases fall back to the backup copy and rewrite the
    # progress files from it.
    print('Goddamnit')
    outlist = _load_pickle(os.path.join(urban_folder, 'latest.pkl.bkp'))
    d(outlist, urban_folder)
except FileNotFoundError:
    outlist = []

# Resume right after the last stored definition.  This is done for both the
# primary and the backup file, and skipped when nothing has been stored yet
# (an empty list has no last element to read the id from).
if outlist:
    start_id = int(outlist[-1]['defid']) + 1
# The magic happens here: walk every id in range(start_id, end_id), fetch its
# definition through Tor and collect the first entry of each non-empty answer.
with Controller.from_port() as c:
    # NOTE(review): the control-port password is hard-coded; it has to match
    # the local Tor configuration -- consider reading it from the environment.
    c.authenticate(password='password')
    for wid in tqdm(range(start_id, end_id), desc="Pulling Descriptions"):
        try:
            # Every 20 ids: checkpoint to disk and switch to a new ip/session.
            if wid % 20 == 0:
                d(outlist, urban_folder)
                s = n(c)
            with tqdm(total=1, leave=False) as t:
                t.set_description(f"[{wid}]")
                t.refresh()
                j = None
                while j is None:
                    try:
                        # Explicit timeout: requests otherwise waits forever
                        # on a stalled circuit.
                        j = s.get(f"{endpoint_pages}?defid={wid}", timeout=30).json()['list']
                    except Exception as e:
                        # Any failure (network, bad JSON, missing key) means
                        # "retry with a new ip".  KeyboardInterrupt is not an
                        # Exception subclass, so it still reaches the outer
                        # handler below.
                        tqdm.write(f"[{wid}] request failed ({e!r}), retrying with a new session")
                        s = n(c)
                t.update(1)
            if j:
                tqdm.write(f"[{wid}/{end_id}]: {pformat(j)}")
                outlist.append(j[0])
        except KeyboardInterrupt:
            tqdm.write("Interrupted! Saving progress to file and exiting")
            d(outlist, urban_folder)
            sys.exit()

# Save the final product to named files
d(outlist, urban_folder, f"UD-{ostart_id}-{end_id}")
@Luxter77
Copy link
Author

Luxter77 commented Feb 4, 2021

Now the script can recover from being terminated unexpectedly while it is dumping the pickle (for example when the parent process is killed, Ctrl+C is pressed repeatedly, or the system loses power).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment