Created
May 19, 2017 21:05
-
-
Save Stormix/8a1dae3d41b81736f1e1333e922404c2 to your computer and use it in GitHub Desktop.
hehe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Configuring the server: | |
1.Install tor: | |
sudo apt-get update | |
sudo apt-get install tor | |
sudo /etc/init.d/tor restart | |
2. Generate a password hash for torrc:
    tor --hash-password my_password   # replace my_password with a password of your own
    Copy the resulting hash, which looks like: "16:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
3. Configure torrc:
    sudo nano /etc/tor/torrc   # if you don't have nano, install it with: sudo apt-get install nano
    Press Ctrl+W, type 'ControlPort' and press Enter to jump to that setting.
    Uncomment the following lines:
        ControlPort 9051
        HashedControlPassword YOUR_HASH_GOES_HERE
        CookieAuthentication 1
    Save your changes by pressing Ctrl+X, then 'Y', then Enter.
4.Restart TOR: | |
sudo /etc/init.d/tor restart | |
5. Install Python Stem (a Python client for communicating with Tor):
    sudo apt-get install python-stem
6. Install all of these using pip:
    - PySocks
    - SocksiPy-branch
    - stem
    - pandas
    - bs4
You can run the script like the following : | |
python stormix.py <pagenumber> | |
(e.g: python stormix.py 2) | |
An output example : | |
###################################################### | |
A new identity was set, | |
Server's IP : xxx.xx.xx.xx | |
Tor's IP : 37.220.35.202 | |
Fetching HTML ... | |
Current Page : 2 | |
###################################################### | |
In order to get a new identity each time, the script must be executed again. | |
That's why I chose to pass the page number as an argument | |
""" | |
import sys | |
from stem import Signal | |
from stem.control import Controller | |
import urllib2, json | |
import socket | |
import socks | |
import time | |
# The page to scrape is passed as the first command-line argument; exit with
# a usage hint instead of an IndexError traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit("usage: python {} <pagenumber>".format(sys.argv[0]))
page_number = sys.argv[1]
# Name of the JSON file the scraped page is written to; rename it as you like.
output_path = "page_{}.json".format(page_number)
def set_new_identity(passwd, port=9051):
    """
    Change identity using TOR.

    params:
        passwd : str : the plain-text control password, i.e. the value that
                 was hashed with `tor --hash-password` and stored as
                 HashedControlPassword in /etc/tor/torrc
        port   : int : TOR ControlPort to connect to; defaults to 9051, the
                 value this script has always used
    P.S : NEWNYM is the signal that asks tor to create a new identity
    """
    # The context manager closes the control connection when the block exits.
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=passwd)
        controller.signal(Signal.NEWNYM)
# NOTE(review): the control password is hard-coded here; it has to be the
# password whose hash is configured in /etc/tor/torrc.
set_new_identity('AnasStormix')
# Fetch the server's actual IP. This has to happen BEFORE the socket class is
# replaced below, while requests still use the normal network stack.
# strip() drops surrounding whitespace (the response may end with a newline)
# so the report printed below stays on clean lines.
real_ip = urllib2.urlopen("http://checkip.amazonaws.com/").read().strip()
# Enable Tor sockets: every socket created from now on goes through the
# SOCKS5 proxy on 127.0.0.1:9050.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9050)
socket.socket = socks.socksocket
# Fetch the IP again, this time as seen through the Tor proxy.
fake_ip = urllib2.urlopen("http://checkip.amazonaws.com/").read().strip()
print("A new identity was set, \n Server's IP : {} \n Tor's IP : {}".format(real_ip, fake_ip))
# ---------------------------------------------------------------------------- | |
import pandas as pd | |
import json | |
from bs4 import BeautifulSoup | |
import urllib2 | |
# Redefined fetch function | |
def fetch_page_html(page="2"):
    """
    Download the HTML of one page of the f6s.com dashboard listing.

    params:
        page : str or int : page number inserted into the `page` query
               parameter (defaults to "2")
    returns:
        the response body exactly as read from the urllib2 response
    """
    # str() lets callers pass the page number as an int as well as a string;
    # the original concatenation raised TypeError for ints.
    link = (
        "https://www.f6s.com/account/dashboard?ss=1&sort=popularity&sort_dir=desc"
        "&all_startups=1&columns[]=markets&columns[]=location&columns[]=founders"
        "&columns[]=founded&columns[]=employees&page=" + str(page) + "&page_alt=1"
    )
    response = urllib2.build_opener().open(link)
    try:
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
bs = BeautifulSoup(fetch_page_html(page_number), "html.parser")


def _stripped_texts(tags):
    """Return the whitespace-stripped text of every tag in `tags` as a list."""
    return [tag.get_text().strip() for tag in tags]


# Fetch company data
# NOTE(review): the third positional argument of findAll() is `recursive`,
# so ".noline" is NOT applied as a class filter here -- confirm the intent
# before changing it, since the [1:] slice depends on the current matches.
# [1:] skips the first matching link (presumably not a company -- verify).
compagny_name = _stripped_texts(bs.findAll("a", {"target" : "_blank"}, ".noline"))[1:]
description = _stripped_texts(bs.select('.profile-description '))
market = _stripped_texts(bs.select('.col-1'))
location = _stripped_texts(bs.select('.col-2'))
founders = _stripped_texts(bs.select('.col-3'))
founded = _stripped_texts(bs.select('.col-4'))
employees = _stripped_texts(bs.select('.col-5'))
# One list per column, keyed by column name.
output = {
    "compagny_name": compagny_name,
    "description": description,
    "market": market,
    "location": location,
    "founders": founders,
    "founded": founded,
    "employees": employees,
}
# Dump "output" as JSON into output_path. The `with` block closes the file
# even if serialisation fails and avoids shadowing the `file` builtin.
with open(output_path, "w") as out_file:
    json.dump(output, out_file)
# To inspect the result with pandas: pd.read_json(output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment