Use a regex to extract URLs from a text file.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 11 15:55:44 2016
@author: Cat
"""
import re
import string

import requests
#import urlextractor

# utils.py
def regex_url(strng):
    print("Start: " + strng.strip() + " inputting...\n")
    REGEXEN = {}  # :nodoc:
    # URL related hash regex collection
    REGEXEN['valid_preceding_chars'] = re.compile(r"(?:[^\/\"':!=]|^|\:)")
    punct = re.escape(string.punctuation)
    REGEXEN['valid_domain'] = re.compile(
        r'(?:[^%s\s][\.-](?=[^%s\s])|[^%s\s]){1,}\.[a-z]{2,}(?::[0-9]+)?' % (punct, punct, punct),
        re.IGNORECASE)
    REGEXEN['valid_url_path_chars'] = re.compile(
        r'[\.\,]?[a-z0-9!\*\'\(\);:=\+\$\/%#\[\]\-_,~@\.]', re.IGNORECASE)
    # Valid end-of-path characters (so /foo. does not gobble the period).
    # 1. Allow ) for Wikipedia URLs.
    # 2. Allow =&# for empty URL parameters and other URL-join artifacts.
    REGEXEN['valid_url_path_ending_chars'] = re.compile(r'[a-z0-9\)=#\/]', re.IGNORECASE)
    REGEXEN['valid_url_query_chars'] = re.compile(
        r'[a-z0-9!\*\'\(\);:&=\+\$\/%#\[\]\-_\.,~]', re.IGNORECASE)
    REGEXEN['valid_url_query_ending_chars'] = re.compile(r'[a-z0-9_&=#]', re.IGNORECASE)
    # Compose the full URL pattern: scheme/prefix, domain, optional path, optional query.
    REGEXEN['valid_url'] = re.compile(r'''
    (
        (https?:\/\/|www\.|bit\.ly)
        (%s)
        (/%s*%s?)?
        (\?%s*%s)?
    )
    ''' % (
        REGEXEN['valid_domain'].pattern,
        REGEXEN['valid_url_path_chars'].pattern,
        REGEXEN['valid_url_path_ending_chars'].pattern,
        REGEXEN['valid_url_query_chars'].pattern,
        REGEXEN['valid_url_query_ending_chars'].pattern
    ), re.IGNORECASE | re.X)
    match = REGEXEN['valid_url'].search(strng)
    if match:
        print("The url is {} ".format(match.group(0)))
    else:
        print("None \n")
    return match
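
# Example: on a line such as "see https://example.com/page for details" the
# composed pattern should locate the URL, so regex_url would return a match
# whose group(0) is "https://example.com/page" (the sample string is
# illustrative only):
#
#     m = regex_url("see https://example.com/page for details")
#     # m.group(0) -> 'https://example.com/page'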
def extractUrl(text, match):
    pretld, posttld = None, None
    url = ""
    tld = match[1]
    startpt, endpt = match[0][0], match[0][1]
    if len(text) > endpt:
        # Check the next character: if it could still be part of a host name,
        # the TLD match is not the end of the domain, so give up.
        nextcharacter = text[endpt]
        if re.match("[a-z0-9-.]", nextcharacter):
            return None
        # Port, path, query and fragment characters that follow the TLD.
        posttld = re.match(':?[0-9]*[/[!#$&-;=?a-z]+]?', text[endpt:])
    # Host-name characters that precede the TLD (subdomains and domain label).
    pretld = re.search('[a-z0-9-.]+?$', text[:startpt])
    if pretld:
        url = pretld.group(0)
        startpt -= len(pretld.group(0))
    url += tld
    if posttld:
        url += posttld.group(0)
        endpt += len(posttld.group(0))
    # If it ends with a . or , strip it because it's probably unintentional.
    url = url.rstrip(",.")
    return (startpt, endpt), url
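
# extractUrl is not exercised by the __main__ block below; it appears to expect
# `match` as ((start, end), tld) for a top-level-domain hit found elsewhere.
# An illustrative call under that assumption:
#
#     extractUrl("visit sub.example.org/x now", ((17, 21), ".org"))
#     # -> ((6, 23), 'sub.example.org/x')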
def url_ok(url):
    """Return True if a HEAD request to the URL answers with HTTP 200."""
    try:
        r = requests.head(url)
        return r.status_code == 200
    except requests.RequestException:
        return False
def parse_file(url_file):
    with open(url_file, 'r') as infile:
        for line in infile:
            print(line)
if __name__ == '__main__':
    with open('12.txt', 'r') as fo:
        for line in fo:
            #results = urlextractor.parseText(line)
            results = regex_url(line)
            if results:
                reachable = url_ok(results.group(0))
                print(results.group(0) + " ----- reachable: " + str(reachable))
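
# Usage sketch (assumes a file named 12.txt sits next to the script and that
# the requests package is installed):
#
#     $ pip install requests
#     $ python extract_urls.py    # extract_urls.py is a hypothetical file name
#
# Each line of 12.txt is scanned; the first URL found on a line is printed
# along with the result of the HEAD-request reachability check.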