Use a regex to extract URLs from a text file.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 11 15:55:44 2016
@author: Cat
"""
import re
import string

import requests
#import urlextractor

# utils.py
def regex_url(strng):
    """Search strng for the first URL-like substring and return the match (or None)."""
    print("Scanning: " + strng.strip())
    REGEXEN = {}  # :nodoc:
    # URL-related regex collection
    REGEXEN['valid_preceding_chars'] = re.compile(r"(?:[^\/\"':!=]|^|\:)")  # defined but unused below
    punct = re.escape(string.punctuation)
    REGEXEN['valid_domain'] = re.compile(r'(?:[^%s\s][\.-](?=[^%s\s])|[^%s\s]){1,}\.[a-z]{2,}(?::[0-9]+)?' % (punct, punct, punct), re.IGNORECASE)
    REGEXEN['valid_url_path_chars'] = re.compile(r'[\.\,]?[a-z0-9!\*\'\(\);:=\+\$\/%#\[\]\-_,~@\.]', re.IGNORECASE)
    # Valid end-of-path characters (so /foo. does not gobble the period).
    # 1. Allow ) for Wikipedia URLs.
    # 2. Allow =&# for empty URL parameters and other URL-join artifacts.
    REGEXEN['valid_url_path_ending_chars'] = re.compile(r'[a-z0-9\)=#\/]', re.IGNORECASE)
    REGEXEN['valid_url_query_chars'] = re.compile(r'[a-z0-9!\*\'\(\);:&=\+\$\/%#\[\]\-_\.,~]', re.IGNORECASE)
    REGEXEN['valid_url_query_ending_chars'] = re.compile(r'[a-z0-9_&=#]', re.IGNORECASE)
    # Full URL pattern: scheme or known prefix, domain, optional path, optional query.
    REGEXEN['valid_url'] = re.compile(r'''
    (
      (https?:\/\/|www\.|bit\.ly)
      (%s)
      (/%s*%s?)?
      (\?%s*%s)?
    )
    ''' % (
        REGEXEN['valid_domain'].pattern,
        REGEXEN['valid_url_path_chars'].pattern,
        REGEXEN['valid_url_path_ending_chars'].pattern,
        REGEXEN['valid_url_query_chars'].pattern,
        REGEXEN['valid_url_query_ending_chars'].pattern
    ), re.IGNORECASE | re.X)
    match = REGEXEN['valid_url'].search(strng)
    if match:
        print("The url is {}".format(match.group(0)))
    else:
        print("None\n")
    return match
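# Illustrative usage of regex_url (the sample string below is an assumption for
# illustration, not part of the original gist):
#
#   m = regex_url("see https://en.wikipedia.org/wiki/URL for details")
#   if m:
#       print(m.group(0))   # expected to print the matched URL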
def extractUrl(text, match):
    """Expand a TLD match of the form ((start, end), tld) into a full URL span."""
    pretld, posttld = None, None
    url = ""
    tld = match[1]
    startpt, endpt = match[0][0], match[0][1]
    # check that the next character is valid
    if len(text) > endpt:
        nextcharacter = text[endpt]
        if re.match(r"[a-z0-9-.]", nextcharacter):
            return None
    posttld = re.match(r':?[0-9]*[/[!#$&-;=?a-z]+]?', text[endpt:])
    pretld = re.search(r'[a-z0-9-.]+?$', text[:startpt])
    if pretld:
        url = pretld.group(0)
        startpt -= len(pretld.group(0))
    url += tld
    if posttld:
        url += posttld.group(0)
        endpt += len(posttld.group(0))
    # if it ends with a . or , strip it because it's probably unintentional
    url = url.rstrip(",.")
    return (startpt, endpt), url
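# extractUrl expects match == ((start, end), tld): the span and text of a matched
# top-level-domain candidate.  A minimal driver sketch (the TLD scan below is an
# assumption for illustration, not part of the original gist):
#
#   for m in re.finditer(r'\.(?:com|org|net|edu)', text, re.IGNORECASE):
#       hit = extractUrl(text, ((m.start(), m.end()), m.group(0)))
#       if hit:
#           (start, end), url = hit
#           print(url)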
def url_ok(url):
    """Return True if a HEAD request to the URL answers with HTTP 200."""
    try:
        r = requests.head(url)
        return r.status_code == 200
    except requests.RequestException:
        return False
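# Note: requests needs a scheme, so matches such as "www.example.com" would fail
# the check as-is; a sketch of prefixing them first (not in the original gist):
#
#   candidate = results.group(0)
#   if not candidate.startswith(("http://", "https://")):
#       candidate = "http://" + candidate
#   url_ok(candidate)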
def parse_file(url_file):
    """Print every line of the given file (simple debugging helper)."""
    with open(url_file, 'r') as infile:
        for line in infile:
            print(line)
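# parse_file is not called by the __main__ block below; it is a standalone helper
# that just echoes the file's lines.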
if __name__ == '__main__':
    with open('12.txt', 'r') as fo:
        for line in fo:
            #results = urlextractor.parseText(line)
            results = regex_url(line)
            if results:
                reachable = url_ok(results.group(0))
                print(results.group(0) + " ----- reachable: " + str(reachable))
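# To run: execute the script with python from a directory containing 12.txt; each
# line is scanned for a URL and any match is checked with a HEAD request.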