Skip to content

Instantly share code, notes, and snippets.

@Stormix
Created May 19, 2017 21:05
Show Gist options
  • Save Stormix/8a1dae3d41b81736f1e1333e922404c2 to your computer and use it in GitHub Desktop.
Save Stormix/8a1dae3d41b81736f1e1333e922404c2 to your computer and use it in GitHub Desktop.
hehe
"""
Configuring the server:
1.Install tor:
sudo apt-get update
sudo apt-get install tor
sudo /etc/init.d/tor restart
2. Generate a password hash for torrc:
tor --hash-password my_password  # replace my_password with a password of your own
Copy the resulting hash: "16:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
3. Configure Torrc
sudo nano /etc/tor/torrc  # if you don't have nano, install it with: sudo apt-get install nano
Press Ctrl+W, type 'ControlPort' and press Enter to jump to the setting.
Uncomment the following lines (and paste your own hash):
ControlPort 9051
HashedControlPassword YOUR_HASH_GOES_HERE
CookieAuthentication 1
Save your changes by pressing Ctrl+X, then 'Y', then Enter.
4.Restart TOR:
sudo /etc/init.d/tor restart
5- Install Python Stem (a Python client library for communicating with Tor):
sudo apt-get install python-stem
6- Install all of these using PIP:
- PySocks
- SocksiPy-branch
- stem
- pandas
- bs4
You can run the script like the following :
python stormix.py <pagenumber>
(e.g: python stormix.py 2)
An output example :
######################################################
A new identity was set,
Server's IP : xxx.xx.xx.xx
Tor's IP : 37.220.35.202
Fetching HTML ...
Current Page : 2
######################################################
In order to get a new identity each time, the script must be executed again.
That's why I chose to pass the page number as an argument
"""
import sys
from stem import Signal
from stem.control import Controller
import urllib2, json
import socket
import socks
import time
# The page number to scrape is passed as the first command-line argument.
if len(sys.argv) < 2:
    # Fail with a usage hint instead of a bare IndexError traceback.
    sys.exit("usage: python {} <pagenumber>".format(sys.argv[0]))
page_number = sys.argv[1]
# Name of the JSON output file; you can name the file however you like.
output_path = "page_{}.json".format(page_number)
def set_new_identity(passwd, port=9051):
    """Ask the Tor daemon for a new identity.

    Opens a connection to Tor's control port, authenticates with the
    control password and sends the NEWNYM signal, which is the command
    that asks Tor to create a new identity.

    Args:
        passwd: str, the plain-text control password, i.e. the one that was
            hashed with ``tor --hash-password`` and configured as
            HashedControlPassword in /etc/tor/torrc.
        port: int, the Tor ControlPort to connect to. Defaults to 9051,
            the port used in this script's setup instructions.
    """
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=passwd)
        controller.signal(Signal.NEWNYM)
# Request a new Tor identity.
# NOTE(review): the control password is hard-coded here; it must match the
# password whose hash is configured in /etc/tor/torrc.
set_new_identity('AnasStormix')
# Fetching the server's actual IP. This must run BEFORE the Tor sockets are
# enabled below, otherwise the request would already go through Tor.
# strip() drops the trailing newline of the response so the printed status
# has no stray blank lines.
real_ip = urllib2.urlopen("http://checkip.amazonaws.com/").read().strip()
# Enabling Tor sockets: every socket created from now on uses the SOCKS5
# proxy at 127.0.0.1:9050.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9050)
socket.socket = socks.socksocket
# Fetching Tor's IP: the same lookup again, now with the Tor sockets enabled.
fake_ip = urllib2.urlopen("http://checkip.amazonaws.com/").read().strip()
print("A new identity was set, \n Server's IP : {} \n Tor's IP : {}".format(real_ip,fake_ip))
# ----------------------------------------------------------------------------
import pandas as pd
import json
from bs4 import BeautifulSoup
import urllib2
# Redefined fetch function
def fetch_page_html(page="2"):
    """Download one page of the f6s startup listing and return its raw HTML.

    Args:
        page: Page number to request. Strings and integers are both accepted;
            the value is converted with ``str()`` before being placed in the
            URL's ``page`` query parameter.

    Returns:
        The response body exactly as returned by ``read()``.
    """
    link = (
        "https://www.f6s.com/account/dashboard?ss=1&sort=popularity&sort_dir=desc&all_startups=1&columns[]=markets&columns[]=location&columns[]=founders&columns[]=founded&columns[]=employees&page="
        + str(page)
        + "&page_alt=1"
    )
    response = urllib2.build_opener().open(link)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
def _stripped_texts(nodes):
    """Return the whitespace-stripped text of every node in *nodes*."""
    return [node.get_text().strip() for node in nodes]


# Download the requested page and parse it.
bs = BeautifulSoup(fetch_page_html(page_number), "html.parser")

# Fetch company data: one list of strings per column.
# NOTE(review): the original call passed ".noline" as a third positional
# argument to findAll. In bs4's signature that position is `recursive`, and a
# non-empty string is truthy, so it never acted as a CSS class filter. It is
# dropped here, which leaves the result unchanged -- confirm whether a filter
# on the "noline" class was actually intended.
# The first matching anchor is skipped by the [1:] slice (presumably it is
# not a company link -- verify against the page markup).
compagny_name = _stripped_texts(bs.findAll("a", {"target": "_blank"}))[1:]
description = _stripped_texts(bs.select('.profile-description '))
market = _stripped_texts(bs.select('.col-1'))
location = _stripped_texts(bs.select('.col-2'))
founders = _stripped_texts(bs.select('.col-3'))
founded = _stripped_texts(bs.select('.col-4'))
employees = _stripped_texts(bs.select('.col-5'))

# Dictionary of the scraped columns to be dumped into the output file.
output = {
    "compagny_name": compagny_name,
    "description": description,
    "market": market,
    "location": location,
    "founders": founders,
    "founded": founded,
    "employees": employees,
}

# Write "output" as JSON to output_path; the with-block closes the file even
# if serialisation fails.
with open(output_path, "w") as output_file:
    json.dump(output, output_file)
# To inspect the result afterwards: pd.read_json(output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment