Last active
February 4, 2021 19:31
-
-
Save Luxter77/3b5e45232a88f1b224610eb86b70e635 to your computer and use it in GitHub Desktop.
Script that uses Tor to scrape definitions from the Urban Dictionary v0 API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding: utf-8 | |
# | |
# urban->json.py | |
# | |
# Copyright 2021 Luxter77 <[email protected]> | |
# | |
# This program is free software; you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation; either version 2 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
# GNU General Public License for more details. | |
# | |
# You should have received a copy of the GNU General Public License | |
# along with this program; if not, write to the Free Software | |
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
# MA 02110-1301, USA. | |
# | |
# | |
from fake_useragent import UserAgent | |
from stem.control import Controller | |
from pickle import UnpicklingError | |
from requests import Session | |
from tqdm.auto import tqdm | |
from pprint import pformat | |
from stem import Signal | |
import requests | |
import sys, os | |
import pickle | |
import json | |
# Urban Dictionary API endpoint; there is normally no reason to change it
endpoint_pages = 'https://api.urbandictionary.com/v0/define'

# Definition ids to scrape: every id in range(start_id, end_id).
# The defaults cover the ids that are not in the 2016 dataset
# by [that one guy on reddit:tm:]
start_id = 9_041_434
end_id = 15_684_329

# Directory that receives the temporary and final output files
urban_folder = os.path.join(os.pardir, 'UrbanDictionary')
os.makedirs(urban_folder, exist_ok=True)
def d(outlist: list, outdir: str, naem: str = "latest") -> None:
    '''Helper function to save progress to file.

    Writes ``outlist`` three times inside ``outdir``: as ``<naem>.pkl``,
    as ``<naem>.pkl.bkp`` and as ``<naem>.json``.

    Args:
        outlist: the definitions collected so far.
        outdir: directory the files are written to.
        naem: base file name (without extension) shared by the three files.
    '''
    tqdm.write("Saving progress to disk!")
    # Each file is flushed and closed before the next one is opened, so the
    # primary pickle is fully on its way to disk before the backup is touched.
    for suffix in (".pkl", ".pkl.bkp"):
        with open(os.path.join(outdir, f"{naem}{suffix}"), "wb") as fh:
            pickle.dump(outlist, fh)
    with open(os.path.join(outdir, f"{naem}.json"), "w") as fh:
        json.dump(outlist, fh)
def n(c: Controller = None) -> Session:
    '''Helper function to reset the ip and get a new session.

    Keeps building sessions until ``p`` accepts one.

    Args:
        c: Tor controller used to request a new identity. When it is None
            (the default) no identity change is requested and only a new
            session is built.

    Returns:
        A ``requests`` session proxied through the local Tor SOCKS port
        with a random User-Agent header.
    '''
    while True:
        if c is not None:
            c.signal(Signal.NEWNYM)
        s = requests.Session()
        # socks5h (rather than socks5) makes the proxy resolve host names,
        # so DNS lookups travel through Tor instead of the local resolver.
        s.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}
        s.headers.update({'User-Agent': UserAgent().random})
        # p() answers True when the session should NOT be used.
        if not p(s):
            return s
        # Release the rejected session's connections before trying again.
        s.close()
def p(s: Session) -> bool:
    '''
    Helper function to check the output node's reliability
    (some output nodes are evil and should not be used)

    Asks api.ipify.org for the public ip seen through the session.

    Args:
        s: the session to probe.

    Returns:
        True when the session should be discarded (the request or the
        parsing of its answer failed), False when an ip was obtained.
    '''
    try:
        # The timeout keeps a stalled circuit from blocking the script
        # forever; a timeout is handled like any other failure below.
        cip = str(s.get("https://api.ipify.org/?format=json", timeout=30).json()['ip'])
    except requests.exceptions.SSLError:
        # SSL failures reject the node without logging anything.
        return True
    except Exception as e:
        tqdm.write(f'SOMETHING HAPPENED! [{e!r}]')
        return True
    # The address is left out of the message when it contains "DOCTYPE"
    # (presumably an HTML page came back instead of an ip).
    shown = "" if "DOCTYPE" in cip else f"[{cip}]"
    tqdm.write(f'NEW IP! {shown}')
    return False
# Initial session; no controller exists yet, so no new identity is requested
s = n()

# Store original starting point (it names the final output files)
ostart_id = start_id

# Try to load last saved progress, else start from scratch.
# NOTE: pickle must only be fed files written by this script itself;
# unpickling data from anyone else can execute arbitrary code.
latest_path = os.path.join(urban_folder, 'latest.pkl')
try:
    with open(latest_path, 'rb') as fh:
        outlist = pickle.load(fh)
except (UnpicklingError, EOFError):
    # A zero-length or cut-off checkpoint raises EOFError or UnpicklingError;
    # fall back to the backup copy and rewrite the checkpoint files from it.
    print('Goddamnit')
    with open(latest_path + '.bkp', 'rb') as fh:
        outlist = pickle.load(fh)
    d(outlist, urban_folder)
except FileNotFoundError:
    outlist = []

# Resume right after the last stored definition, whichever file it came
# from; an empty checkpoint keeps the configured start_id.
if outlist:
    start_id = int(outlist[-1]['defid']) + 1
# The magic happens here

# Seconds to wait for one API answer; without a limit a stalled circuit
# would block the scraper forever instead of triggering a retry.
request_timeout = 30

# NOTE(review): the control-port password is hard-coded below; it has to
# match the one configured for the local Tor control port.
with Controller.from_port() as c:
    c.authenticate(password='password')
    for wid in tqdm(range(start_id, end_id), desc="Pulling Descriptions"):
        try:
            # Every 20th id: checkpoint to disk and switch to a new identity
            if wid % 20 == 0:
                d(outlist, urban_folder)
                s = n(c)
            with tqdm(total=1, leave=False) as t:
                t.set_description(f"[{wid}]")
                t.refresh()
                j = None
                # Retry the same id until the API hands back a 'list' value
                while j is None:
                    try:
                        j = s.get(
                            endpoint_pages,
                            params={'defid': wid},
                            timeout=request_timeout,
                        ).json()['list']
                    except Exception as err:
                        # KeyboardInterrupt is not an Exception subclass, so
                        # Ctrl+C still reaches the outer handler untouched.
                        tqdm.write(f"[{wid}] request failed ({err!r}), getting a new session")
                        s = n(c)
                t.update(1)
            # An empty list means the id has no definition: nothing to store
            if j:
                tqdm.write(f"[{wid}/{end_id}]: {pformat(j)}")
                outlist.append(j[0])
        except KeyboardInterrupt:
            tqdm.write("Interrupted! Saving progress to file and exiting")
            d(outlist, urban_folder)
            sys.exit()

# Save the final product to named files
d(outlist, urban_folder, f"UD-{ostart_id}-{end_id}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Now the script can recover from unexpected deaths while dumping the pickle (like killing the parent process, spamming Ctrl+C, or the system losing power)