Last active
April 6, 2017 11:15
-
-
Save chriswebb09/8dbd3eadbe75ca37bcb54465e29f5749 to your computer and use it in GitHub Desktop.
Python Requests Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: UTF-8 -*- | |
import requests | |
import sys | |
import re | |
class WebCrawler:
    """Fetch one web page and list the absolute http(s) URLs found in its text.

    The target URL is taken from the ``url`` argument when given, otherwise
    from the first command-line argument (``sys.argv[1]``), which is how the
    script entry point uses the class.
    """

    # Same pattern the original passed to ``re.findall``; written as a raw
    # string so ``\(`` and ``\)`` reach the regex engine without relying on
    # Python leaving an invalid string escape untouched. Compiled once here
    # instead of on every call.
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    def __init__(self, url=None):
        """Store the URL to fetch, prefixing ``https://`` when no scheme is given.

        Args:
            url: Address of the page to fetch. When ``None`` (the default),
                the first command-line argument is used instead.

        Raises:
            IndexError: If ``url`` is ``None`` and no command-line argument
                was supplied.
        """
        if url is None:
            if len(sys.argv) < 2:
                # Same exception type the bare ``sys.argv[1]`` lookup used to
                # raise, but with a message that says what is missing.
                raise IndexError(
                    "WebCrawler needs a URL: pass url=... or give it as the "
                    "first command-line argument"
                )
            url = sys.argv[1]

        # URL schemes are case-insensitive, so "HTTP://host" must not get a
        # second scheme prepended.
        if url.lower().startswith(("http://", "https://")):
            self.url = url
        else:
            self.url = "https://" + url

    @staticmethod
    def extract_urls(text):
        """Return every http(s) URL matched in ``text``, in order of appearance.

        Args:
            text: The string to scan.

        Returns:
            A list of matched URL strings; empty when nothing matches.
            Duplicates are kept.
        """
        return WebCrawler._URL_PATTERN.findall(text)

    def request_resource(self, timeout=10):
        """GET ``self.url`` and return the URLs found in the response body.

        Args:
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument, so a stalled server cannot block the call forever.

        Returns:
            A list of URL strings found in the response text.
        """
        response = requests.get(self.url, timeout=timeout)
        # The status code is deliberately not checked: links are extracted
        # from whatever body the server returned, as before.
        return self.extract_urls(response.text)
def main():
    """Command-line entry point: print each URL found on the requested page.

    The page address is read from the first command-line argument by
    ``WebCrawler``. When that argument is missing, the script exits with a
    usage message on stderr instead of an ``IndexError`` traceback.

    Raises:
        SystemExit: If no URL argument was supplied.
    """
    if len(sys.argv) < 2:
        prog = sys.argv[0] if sys.argv else "crawler"
        # A string argument makes sys.exit print it to stderr and exit non-zero.
        sys.exit("usage: {} <url>".format(prog))

    crawl = WebCrawler()
    # request_resource() returns the list produced by re.findall, which is
    # never None, so the result can be iterated directly.
    for url in crawl.request_resource():
        print(url)
# Run the crawler only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment