Use a regex to extract URLs from a text file.
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 11 15:55:44 2016
@author: Cat
"""
import re
import string

import requests
#import urlextractor

# utils.py
def regex_url(strng):
    """Search strng for the first URL-like substring and return the match (or None)."""
    print("Scanning: " + strng.strip())
    REGEXEN = {}  # :nodoc:
    # URL-related regex collection
    REGEXEN['valid_preceding_chars'] = re.compile(r"(?:[^\/\"':!=]|^|\:)")  # defined but unused below
    punct = re.escape(string.punctuation)
    REGEXEN['valid_domain'] = re.compile(r'(?:[^%s\s][\.-](?=[^%s\s])|[^%s\s]){1,}\.[a-z]{2,}(?::[0-9]+)?' % (punct, punct, punct), re.IGNORECASE)
    REGEXEN['valid_url_path_chars'] = re.compile(r'[\.\,]?[a-z0-9!\*\'\(\);:=\+\$\/%#\[\]\-_,~@\.]', re.IGNORECASE)
    # Valid end-of-path characters (so /foo. does not gobble the period).
    # 1. Allow ) for Wikipedia URLs.
    # 2. Allow =&# for empty URL parameters and other URL-join artifacts.
    REGEXEN['valid_url_path_ending_chars'] = re.compile(r'[a-z0-9\)=#\/]', re.IGNORECASE)
    REGEXEN['valid_url_query_chars'] = re.compile(r'[a-z0-9!\*\'\(\);:&=\+\$\/%#\[\]\-_\.,~]', re.IGNORECASE)
    REGEXEN['valid_url_query_ending_chars'] = re.compile(r'[a-z0-9_&=#]', re.IGNORECASE)
    # Full URL pattern: scheme or known prefix, domain, optional path, optional query.
    REGEXEN['valid_url'] = re.compile(r'''
    (
      (https?:\/\/|www\.|bit\.ly)
      (%s)
      (/%s*%s?)?
      (\?%s*%s)?
    )
    ''' % (
        REGEXEN['valid_domain'].pattern,
        REGEXEN['valid_url_path_chars'].pattern,
        REGEXEN['valid_url_path_ending_chars'].pattern,
        REGEXEN['valid_url_query_chars'].pattern,
        REGEXEN['valid_url_query_ending_chars'].pattern
    ), re.IGNORECASE | re.X)
    match = REGEXEN['valid_url'].search(strng)
    if match:
        print("The url is {}".format(match.group(0)))
    else:
        print("None\n")
    return match
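# Illustrative usage of regex_url (the sample string below is an assumption for
# illustration, not part of the original gist):
#
#   m = regex_url("see https://en.wikipedia.org/wiki/URL for details")
#   if m:
#       print(m.group(0))   # expected to print the matched URL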
def extractUrl(text, match):
    """Expand a TLD match of the form ((start, end), tld) into a full URL span."""
    pretld, posttld = None, None
    url = ""
    tld = match[1]
    startpt, endpt = match[0][0], match[0][1]
    # check that the next character is valid
    if len(text) > endpt:
        nextcharacter = text[endpt]
        if re.match(r"[a-z0-9-.]", nextcharacter):
            return None
    posttld = re.match(r':?[0-9]*[/[!#$&-;=?a-z]+]?', text[endpt:])
    pretld = re.search(r'[a-z0-9-.]+?$', text[:startpt])
    if pretld:
        url = pretld.group(0)
        startpt -= len(pretld.group(0))
    url += tld
    if posttld:
        url += posttld.group(0)
        endpt += len(posttld.group(0))
    # if it ends with a . or , strip it because it's probably unintentional
    url = url.rstrip(",.")
    return (startpt, endpt), url
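# extractUrl expects match == ((start, end), tld): the span and text of a matched
# top-level-domain candidate.  A minimal driver sketch (the TLD scan below is an
# assumption for illustration, not part of the original gist):
#
#   for m in re.finditer(r'\.(?:com|org|net|edu)', text, re.IGNORECASE):
#       hit = extractUrl(text, ((m.start(), m.end()), m.group(0)))
#       if hit:
#           (start, end), url = hit
#           print(url)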
def url_ok(url):
    """Return True if a HEAD request to the URL answers with HTTP 200."""
    try:
        r = requests.head(url)
        return r.status_code == 200
    except requests.RequestException:
        return False
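# Note: requests needs a scheme, so matches such as "www.example.com" would fail
# the check as-is; a sketch of prefixing them first (not in the original gist):
#
#   candidate = results.group(0)
#   if not candidate.startswith(("http://", "https://")):
#       candidate = "http://" + candidate
#   url_ok(candidate)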
def parse_file(url_file):
    """Print every line of the given file (simple debugging helper)."""
    with open(url_file, 'r') as infile:
        for line in infile:
            print(line)
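# parse_file is not called by the __main__ block below; it is a standalone helper
# that just echoes the file's lines.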
if __name__ == '__main__':
    with open('12.txt', 'r') as fo:
        for line in fo:
            #results = urlextractor.parseText(line)
            results = regex_url(line)
            if results:
                reachable = url_ok(results.group(0))
                print(results.group(0) + " ----- reachable: " + str(reachable))
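# To run: execute the script with python from a directory containing 12.txt; each
# line is scanned for a URL and any match is checked with a HEAD request.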