Skip to content

Instantly share code, notes, and snippets.

@GiedriusS
Created March 2, 2018 15:00
Show Gist options
  • Save GiedriusS/4ba076a7829b8a4c670fa1a63ce77ef1 to your computer and use it in GitHub Desktop.
import requests
import lxml.html
from lxml.cssselect import CSSSelector
import re
import requests
def check_url(url):
    """Report whether a HEAD request to ``url`` is answered with HTTP 200.

    The probe is best-effort: it uses a 1 second timeout and treats every
    request failure as "not reachable". Only a status code of exactly 200
    counts as success.

    Args:
        url: Absolute URL (including scheme) to probe.

    Returns:
        True if the HEAD response status is 200, False for any other status
        or if the request could not be completed.
    """
    try:
        response = requests.head(url, timeout=1)
    except (requests.RequestException, ValueError):
        # Connection errors, timeouts and malformed URLs all mean "unusable".
        # Naming the exceptions (instead of a bare ``except:``) lets
        # KeyboardInterrupt / SystemExit propagate, so the script can still
        # be stopped while it is probing a long list of hosts.
        return False
    return response.status_code == 200
def date_parser(txt):
    """Extract a Lithuanian long-form date from ``txt`` as ``YYYY-MM-DD``.

    The expected form is ``<year> m. <month name in genitive> <day> d.``,
    e.g. ``2016 m. sausio 5 d.``. Matching is case-insensitive and the date
    may appear anywhere in the text, including after a line break. If several
    dates are present, the last one is used.

    Args:
        txt: Text to search, or None.

    Returns:
        The date as ``'YYYY-MM-DD'`` with a zero-padded day, or ``'N/A'`` when
        ``txt`` is None or contains no recognisable date.
    """
    if txt is None:
        return 'N/A'
    months = {
        'sausio': '01', 'vasario': '02', 'kovo': '03', 'balandžio': '04',
        'gegužės': '05', 'birželio': '06', 'liepos': '07', 'rugpjūčio': '08',
        'rugsėjo': '09', 'spalio': '10', 'lapkričio': '11', 'gruodžio': '12',
        # Variant of 'lapkričio' with a stray U+008D character in it --
        # presumably a mis-encoded spelling seen in the scraped page.
        'lapkrič%cio' % 0x8d: '11',
    }
    # Build the month alternation from the table so the pattern and the
    # lookup below can never disagree.
    month_alternation = '|'.join(re.escape(name) for name in months)
    pattern = r'.*(20\d{2}) m\. (' + month_alternation + r') (\d{1,2}) d\.'
    # re.S (not re.M) is what allows the leading '.*' to skip over newlines;
    # without it a date on the second or later line is never found.
    matches = re.match(pattern, txt, re.I | re.S)
    if matches is None:
        return 'N/A'
    year, month, day = matches.groups()
    return year + '-' + months[month.lower()] + '-' + day.zfill(2)
def get_all(el):
    """Collect the text of an element, or of its direct children.

    Args:
        el: An element-like object that supports ``len()``, iteration over
            its children and a ``.text`` attribute, or None.

    Returns:
        * ``'Unknown'`` when ``el`` is None.
        * When ``el`` has children: the ``.text`` of every direct child that
          has text, each followed by a single space (so the result ends with
          a trailing space). ``el.text`` itself is not included.
        * Otherwise ``el.text`` unchanged, which is None when the element
          carries no text.
    """
    if el is None:
        return 'Unknown'
    if len(el) == 0:
        return el.text
    # Children without text (e.g. <br>) are skipped; stringifying them would
    # inject the literal word 'None' into the result.
    return ''.join(str(child.text) + ' ' for child in el if child.text is not None)
# Scrape the listing page and print one CSV line per listed host to stdout.
list_url = 'http://www.lpt.lt/nelegalios-losimu-veiklos-vykdytojai/'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# A timeout keeps the script from hanging forever on an unresponsive server,
# and the status check stops an HTTP error page from being parsed as data.
resp = requests.get(list_url, headers=headers, timeout=30)
resp.raise_for_status()
tree = lxml.html.fromstring(resp.text)
sel = CSSSelector('table[border="1"]>tbody>tr')
# The first two rows are skipped -- presumably table headers; verify against
# the page if the layout changes.
trs = sel(tree)[2:]
print('url,category_code,category_description,date_added,source,notes')
for tr in trs:
    # Cell 2 is read as the host list and cell 4 as the date text; rows that
    # are too short to have both are skipped.
    if len(tr) < 5:
        continue
    # get_all() can return None for a cell without text.
    url = (get_all(tr[2]) or '').strip()
    date = get_all(tr[4])
    datestr = date_parser(date)
    # A single cell may list several whitespace-separated hosts.
    for u in url.split():
        # Prefer https when the host answers over it, otherwise use http.
        w_https = 'https://' + u
        u = w_https if check_url(w_https) else 'http://' + u
        print(u, 'GMB', 'Gambling', datestr, list_url, 'blocked via local ISPs DNS', sep=',')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment