Created
July 25, 2014 05:31
-
-
Save anonymous/ea17299fcd86d8af2be5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
import re | |
import json | |
from bs4 import BeautifulSoup as bs | |
from urllib2 import urlopen | |
# Script authors, one "name address" string per entry.
# NOTE(review): the address reads "[email protected]" in this copy, which looks
# like an obfuscation placeholder rather than a real address -- confirm
# against the upstream script before publishing.
__authors__ = ['haxwithaxe [email protected]']
__license__ = 'GPLv3'

# Credit text; EmailToTxt.json() emits it under the 'credits' key.
# NOTE(review): the leading/trailing blanks are kept exactly as found; the
# upstream text may have used different whitespace -- confirm.
_thanks = (
    ' Special thanks to Ukraine Calling (http://www.ukrainecalling.com) for '
    'assembling the list that this script uses as a data source. '
)

# Default page scraped by EmailToTxt when no url is passed.
_source = 'http://www.ukrainecalling.com/email-to-text.aspx'
class EmailToTxt(object):
    ''' Scrapes an email-to-text gateway table and exposes it as JSON.

    Constructing an instance immediately downloads and parses ``url``, so the
    constructor performs network I/O.

    Attributes:
      url        page that is scraped
      soup       parsed document (None until scrape() has run)
      countries  dict mapping lower-cased country name to a list of entry
                 dicts (see _get_entry for their layout)
      country    lower-cased name of the country whose rows are currently
                 being collected (None until the first country row is seen)
    '''

    def __init__(self, url=_source):
        ''' download and parse the gateway table
        @param url page to scrape, defaults to the module level _source
        '''
        self.url = url
        self.soup = None
        self.countries = {}
        self.country = None
        self.scrape()
        self.parse()

    def json(self, country=None):
        ''' dumps scraped data as json
        @param country optional country name (case insensitive) to dump only
               the values for that country
        @returns json string with the keys 'credits', 'source' and 'data';
                 'data' is null when the requested country is unknown
        '''
        if country:
            # .get() means an unknown country is reported as null, not raised.
            data = self.countries.get(country.lower())
        else:
            data = self.countries
        data = {'credits': _thanks, 'source': self.url, 'data': data}
        return json.dumps(data)

    def __str__(self):
        ''' @returns the full json dump (same as json() without arguments) '''
        return self.json()

    def scrape(self):
        ''' fetch self.url and store the parsed document in self.soup '''
        # Imported locally so the method carries its own dependency.
        from contextlib import closing
        # closing() guarantees the HTTP response is released even when
        # reading or parsing fails; urllib2 responses are not context managers.
        with closing(urlopen(self.url)) as response:
            self.soup = bs(response.read())

    def parse(self):
        ''' walk every row of the 'tblcn' table and fill self.countries
        @throws ValueError if the page has no table with class 'tblcn'
        '''
        table = self.soup.find('table', class_='tblcn')
        if table is None:
            # Fail with a message that names the problem instead of an
            # AttributeError on None.
            raise ValueError(
                "no <table class='tblcn'> found at %s" % self.url)
        for row in table.find_all('tr'):
            self.parse_row(row)

    def parse_row(self, row):
        ''' dispatch one table row
        Rows with a 'class' attribute start a new country, rows with an
        'itemprop' attribute describe a gateway; all other rows are ignored.
        @param row a <tr> element
        '''
        if row.get('class'):
            self._add_country(row)
        elif row.get('itemprop'):
            self._add_gateway(row)

    def _add_country(self, row):
        ''' make the country named in the row's <h3> the current one
        Any entries already stored under the same name are replaced by an
        empty list.
        '''
        self.country = row.find('h3').text.lower()
        self.countries[self.country] = []

    def _add_gateway(self, row):
        ''' append the row's entry to the current country's list '''
        self.countries[self.country].append(self._get_entry(row))

    def _get_entry(self, row):
        ''' build the entry dict for one gateway row
        The first three <td> cells are read as provider, gateway and notes.
        @param row a <tr> element
        @returns dict with the keys 'provider', 'gateway_raw', 'gateway',
                 'notes' and 'number_format'; fields whose cell is missing
                 stay None
        '''
        entry = {'provider': None, 'gateway_raw': None, 'gateway': None,
                 'notes': None, 'number_format': None}
        handlers = (self._provider, self._gateway, self._notes)
        # zip() stops at the shorter sequence: short rows leave the remaining
        # fields as None and surplus cells are ignored rather than raising
        # IndexError.
        for handler, cell in zip(handlers, row.find_all('td')):
            handler(entry, cell)
        return entry

    def _provider(self, entry, item):
        ''' store the provider cell's text in the entry '''
        entry['provider'] = item.text

    def _gateway(self, entry, item):
        ''' store the gateway cell in raw and parsed form
        Sets 'gateway_raw' (the cell's text), 'gateway' (format string) and
        'number_format' ("<n> digit number").
        '''
        fmt, digits = self._gateway_format(item)
        # Keep the unparsed cell text next to the derived format string.
        entry['gateway_raw'] = item.text
        entry['gateway'] = fmt
        entry['number_format'] = '%s digit number' % digits

    def _gateway_format(self, col):
        ''' derive an address format string from a gateway cell
        A cell with two children is read as (number element, gateway), one
        with three children as (prefix, number element, gateway). Any other
        child count leaves all parts empty.
        @param col the gateway <td> element
        @returns tuple (fmt, digits): fmt is '<prefix>%(number)s@<gateway>'
                 and digits is the length of the number text as a string
        '''
        prefix = number = gateway = ''
        children = col.contents
        if len(children) == 2:
            number = children[0].text
            gateway = children[1]
        elif len(children) == 3:
            prefix = children[0]
            number = children[1].text
            gateway = children[2]
        # '%%' survives this formatting pass as a literal '%', leaving a
        # '%(number)s' placeholder in the result.
        fmt = '%s%%(number)s@%s' % (prefix, gateway)
        digits = str(len(number))
        return fmt, digits

    def _notes(self, entry, item):
        ''' store the notes cell's text in the entry '''
        entry['notes'] = item.text
if __name__ == '__main__':
    # Building the instance scrapes and parses the default source page;
    # printing it emits the JSON dump through __str__.
    print(EmailToTxt())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment