Skip to content

Instantly share code, notes, and snippets.

@chriswebb09
Last active April 6, 2017 11:15
Show Gist options
  • Save chriswebb09/8dbd3eadbe75ca37bcb54465e29f5749 to your computer and use it in GitHub Desktop.
Save chriswebb09/8dbd3eadbe75ca37bcb54465e29f5749 to your computer and use it in GitHub Desktop.
Python Requests Example
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import requests
import sys
import re
class WebCrawler:
    """Fetch one web page and list the absolute http(s) URLs found in its text.

    The target address always ends up with an explicit scheme: an address
    that does not already start with ``http://`` or ``https://`` gets
    ``https://`` prepended.
    """

    # Compiled once when the class is created. Written as a raw string so
    # that ``\(`` and ``\)`` reach the regex engine unchanged instead of
    # being reported as invalid string escapes by newer Python versions.
    # NOTE(review): ``$-_`` inside the character class is a *range* from
    # '$' to '_' (so it also admits characters such as '/', ':', '<', '>').
    # It is kept as-is to preserve the existing matching behaviour.
    _URL_PATTERN = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    def __init__(self, url=None):
        """Store the address to crawl.

        Args:
            url: Address of the page to fetch. When omitted, the first
                command-line argument (``sys.argv[1]``) is used.

        Raises:
            IndexError: If ``url`` is omitted and no command-line argument
                was supplied.
        """
        if url is None:
            if len(sys.argv) < 2:
                # Same exception type as the bare sys.argv[1] lookup would
                # raise, but with a message that says what is missing.
                raise IndexError(
                    "WebCrawler needs a URL: pass one explicitly or as the "
                    "first command-line argument"
                )
            url = sys.argv[1]

        if url.startswith(("http://", "https://")):
            self.url = url
        else:
            self.url = "https://" + url

    @staticmethod
    def _extract_urls(text):
        """Return every substring of ``text`` that matches the URL pattern, in order."""
        return WebCrawler._URL_PATTERN.findall(text)

    def request_resource(self, timeout=10):
        """Download ``self.url`` and return the URLs found in the response body.

        Args:
            timeout: Value passed to ``requests.get`` as its ``timeout``
                argument, so that an unresponsive server cannot block the
                call forever.

        Returns:
            A list of matched URL strings in order of appearance;
            duplicates are not removed.
        """
        response = requests.get(self.url, timeout=timeout)
        return self._extract_urls(response.text)
def main():
    """Command-line entry point: print each URL found on the requested page.

    The page address is taken from the first command-line argument. When
    no argument is given, the process exits with a usage message instead
    of failing with an IndexError traceback.
    """
    if len(sys.argv) < 2:
        program = sys.argv[0] if sys.argv else "crawler"
        # sys.exit with a string writes it to stderr and exits with status 1.
        sys.exit("usage: {} <url>".format(program))

    crawl = WebCrawler()
    # request_resource returns the result of re.findall, which is always a
    # list, so no None check is needed before iterating.
    for url in crawl.request_resource():
        print(url)
# Run the crawler only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment