Scrape Django pages, with support for CSRF POST forms and login-required views, using just the Python 3 standard library.
# Log in to a website using just the Python 3 standard library
import urllib.parse
import urllib.request
import http.cookiejar


def scraper():
    # the URL found inside the login form's action="..." attribute;
    # adjust as needed, it can be all kinds of weird stuff
    authentication_url = '/admin/login/'
    # username and password for the login form
    username = 'epilys'
    password = 'password'
    # initiate the cookie jar (using http.cookiejar and urllib.request)
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
    urllib.request.install_opener(opener)
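    # Because install_opener() makes this opener the global default, every
    # later urllib.request.urlopen() call reuses the same jar, so the Django
    # session cookie set at login persists across make_get()/make_post().
    # A hedged illustration, commented out so it does not run during login:
    #
    #   for cookie in cookie_jar:
    #       print(cookie.name, cookie.value)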
    def make_get(url):
        ####### change variables here: base URL, user, pass
        # your base URL, used for headers and such: once without the scheme
        # (for the Host header) and once with it
        base_url = '127.0.0.1:8000'
        https_base_url = 'http://127.0.0.1:8000'
        url = https_base_url + url
        ####### the rest of the function is logic
        # big thing! you need a Referer for most pages, and correct headers are key
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": "Mozilla/5.0 Chrome/81.0.4044.92",  # a recent Chrome UA string
            "Host": base_url,
            "Origin": https_base_url,
            "Referer": https_base_url,
        }
        # a simple GET request with the headers above (using urllib.request)
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        contents = response.read()
        html = contents.decode("utf-8")
        return html
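    # Note: urllib.request.urlopen() raises urllib.error.HTTPError on 4xx/5xx
    # responses instead of returning them. A minimal sketch of calling
    # make_get() defensively ("/missing/" is a made-up path for illustration):
    #
    #   import urllib.error
    #   try:
    #       html = make_get("/missing/")
    #   except urllib.error.HTTPError as e:
    #       print("request failed with status", e.code)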
    def make_post(url, payload):
        ####### change variables here: base URL, user, pass
        # your base URL, used for headers and such: once without the scheme
        # (for the Host header) and once with it
        base_url = '127.0.0.1:8000'
        https_base_url = 'http://127.0.0.1:8000'
        url = https_base_url + url
        ####### the rest of the function is logic,
        # but you may need to tweak a couple of things regarding the "token" logic
        # (the field can be _token or token or _token_ or secret ... etc.)
        # big thing! you need a Referer for most pages, and correct headers are key
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": "Mozilla/5.0 Chrome/81.0.4044.92",  # a recent Chrome UA string
            "Host": base_url,
            "Origin": https_base_url,
            "Referer": https_base_url,
        }
        # first a simple GET request, just to fetch the page and parse out the token
        # (using urllib.request)
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        contents = response.read()
        # parse the page; we look for the token, e.g. on my page it was something like:
        # <input type="hidden" name="_token" value="random1234567890qwertzstring">
        # this can be done more robustly; see the extract_token() sketch below
        html = contents.decode("utf-8")
        # text just before the start and just after the end of the token string
        mark_start = '<input type="hidden" name="csrfmiddlewaretoken" value="'
        mark_end = '">'
        # indices of those two points
        start_index = html.find(mark_start) + len(mark_start)
        end_index = html.find(mark_end, start_index)
        # the text between them is our token; store it for the actual POST
        token = html[start_index:end_index]
        # here we craft our payload: all the form fields, including HIDDEN fields!
        # that includes the token we scraped earlier, as that's usually a hidden field.
        # make sure the left side comes from the "name" attributes of the form,
        # and the right side is what you want to post as the "value";
        # for hidden fields, replicate the expected answer,
        # e.g. "token" or "yes I agree" checkboxes and such
        payload['csrfmiddlewaretoken'] = token
        # now we prepare everything we need for the POST:
        # data - our payload (user/pass/token) urlencoded and encoded as bytes
        data = urllib.parse.urlencode(payload)
        binary_data = data.encode('UTF-8')
        # and put the URL + encoded data + correct headers into our request.
        # btw, despite what I thought, it is automatically treated as a POST
        # because of the bytes-encoded data field, so you don't need to spell out:
        # urllib.request.Request(url, binary_data, headers, method='POST')
        request = urllib.request.Request(url, binary_data, headers)
        response = urllib.request.urlopen(request)
        contents = response.read()
        contents = contents.decode("utf-8")
        return contents
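    # A sketch of the more robust token extraction mentioned in make_post(),
    # using the standard library's html.parser instead of string searching.
    # "csrfmiddlewaretoken" is Django's default hidden-field name; pass a
    # different field_name if your form calls it something else.
    from html.parser import HTMLParser

    def extract_token(page_html, field_name="csrfmiddlewaretoken"):
        class TokenParser(HTMLParser):
            def __init__(self):
                super().__init__()
                self.token = None

            def handle_starttag(self, tag, attrs):
                # attrs arrives as ('name', ...) / ('value', ...) pairs
                attrs = dict(attrs)
                if tag == "input" and attrs.get("name") == field_name:
                    self.token = attrs.get("value")

        parser = TokenParser()
        parser.feed(page_html)
        return parser.token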
    login_data = {
        # 'name': 'value',  # make sure all additional fields use this format!
        'username': username,
        'password': password,
        'next': "/",  # where Django's login view redirects after success
    }
    contents = make_post(authentication_url, login_data)
    # just for kicks, confirm some element of the page that sits behind the login:
    # use a string you know only occurs after logging in,
    # like "logout" or "welcome" or "member"; I found "Logout" is pretty safe so far
    # with open('_scrape/index.html', 'w') as f:
    #     f.write(contents)
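    # A minimal sketch of the check described above, assuming "Log out" or
    # "Logout" is the marker (the former appears in the default Django admin
    # for authenticated users); swap in whatever your site shows after login.
    if "Log out" not in contents and "Logout" not in contents:
        print("warning: login marker not found; login may have failed")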
""" | |
page = "/?full_text_query=abolition+of+private+property" | |
html = make_get(page) | |
filename = page.replace('/', '_').replace('?', '_') + '.html' | |
print(filename) | |
with open('_scrape/'+filename, 'w') as f: | |
f.write(html) | |
change_layout_data = { "layout": "table", "change-layout": "change" } | |
make_post("", change_layout_data) | |
page = "/?full_text_query=abolition+of+private+property" | |
html = make_get(page) | |
filename = 'table_'+page.replace('/', '_').replace('?', '_') + '.html' | |
print(filename) | |
with open('_scrape/'+filename, 'w') as f: | |
f.write(html) | |
""" | |
""" | |
doc_page = "/document/cefc21a0-f7ab-46ae-8824-46d254ba053d/" | |
html = make_get(doc_page) | |
filename = 'doc_'+doc_page.replace('/', '_').replace('?', '_') + '.html' | |
print(filename) | |
with open('_scrape/'+filename, 'w') as f: | |
f.write(html) | |
""" | |
    change_layout_data = {"layout": "grid", "change-layout": "change"}
    make_post("", change_layout_data)
    page = ""
    html = make_get(page)
    filename = 'grid.html'
    print(filename)
    with open('web-demo/' + filename, 'w') as f:
        f.write(html)
""" | |
for page in [ | |
("get", "/database/"), | |
("get", "/tag/"), | |
("get", "/tag_d3/"), | |
("post", "", { "layout": "table", "change-layout": "change" }), | |
]: | |
print(page) | |
if page[0] is "get": | |
html = make_get(page[1]) | |
else: | |
html = make_post(page[1], page[2]) | |
filename = page[1].replace('/', '_') + '.html' | |
print(filename) | |
with open('_scrape/'+filename, 'w') as f: | |
f.write(html) | |
""" | |
if __name__ == "__main__":
    scraper()