Skip to content

Instantly share code, notes, and snippets.

@bowbahdoe
Created August 20, 2016 01:19
Show Gist options
  • Select an option

  • Save bowbahdoe/dced23d4e9ad23fb5679841749fb9f9b to your computer and use it in GitHub Desktop.

Select an option

Save bowbahdoe/dced23d4e9ad23fb5679841749fb9f9b to your computer and use it in GitHub Desktop.
import re
import requests
import phonenumbers as pn
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Module-level MongoDB handles shared by the scraper: a client for the
# server at localhost:27017, the ``scraped_data`` database on it, and the
# ``phone_numbers`` collection that receives the scraped entries.
client = MongoClient('localhost', 27017)
db = client['scraped_data']
collection = db['phone_numbers']
# Candidate phone-number shapes: 10 digits with optional separators,
# "(ddd) ddd-dddd", or a bare 7-digit local number.
_PHONE_PATTERN = (
    r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}"
    r"|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}"
    r"|\d{3}[-\.\s]??\d{4})"
)
# One compiled pattern per input type; re.ASCII keeps the str variant's
# \d and \s classes identical to the bytes variant's.
_PHONE_RE_BYTES = re.compile(_PHONE_PATTERN.encode("ascii"))
_PHONE_RE_STR = re.compile(_PHONE_PATTERN, re.ASCII)


def find_numbers(text):
    """Return the substrings of *text* that are valid US phone numbers.

    Args:
        text: The page content to scan, either ``bytes`` (for example
            ``requests.Response.content``) or ``str``.

    Returns:
        A list of the matched substrings, in order of appearance and in
        the same type as *text* (``bytes`` matches for ``bytes`` input,
        ``str`` matches for ``str`` input). Only candidates that
        ``phonenumbers`` reports as both valid and possible for region
        ``'US'`` are kept.
    """
    is_text = isinstance(text, str)
    pattern = _PHONE_RE_STR if is_text else _PHONE_RE_BYTES

    real_numbers = []
    for number in pattern.findall(text):
        # Decode bytes matches explicitly: str(b"...") would hand the
        # parser the repr "b'...'" instead of the digits themselves.
        # The pattern only matches ASCII, so this decode cannot fail.
        candidate = number if is_text else number.decode("ascii")
        try:
            number_object = pn.parse(candidate, 'US')
        except pn.NumberParseException:
            # A candidate the library cannot parse is simply not a number.
            continue
        if pn.is_valid_number(number_object) and pn.is_possible_number(number_object):
            real_numbers.append(number)
    return real_numbers
def get_links(html):
    """Return the distinct ``href`` values of the anchors in *html*.

    Args:
        html: The HTML markup to parse (passed to BeautifulSoup with the
            ``'html.parser'`` backend).

    Returns:
        A list of unique ``href`` attribute values taken from ``<a>``
        tags, in no particular order. Anchors without an ``href``
        attribute are skipped, so the list never contains ``None``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # href=True restricts the search to anchors that carry the attribute;
    # the set comprehension removes duplicate targets.
    unique_links = {anchor['href'] for anchor in soup.find_all('a', href=True)}
    return list(unique_links)
def make_entry(phone_number, url):
    """Build the record stored for one phone number found at *url*.

    Returns a dict with the keys ``'url'`` and ``'phone_number'``.
    """
    return dict(url=url, phone_number=phone_number)
def maine():
    """Scrape phone numbers from a start page and the pages it links to.

    Fetches the hard-coded start URL, collects the phone numbers found
    on it, then fetches every link on that page (one level deep) and
    collects the numbers found there as well. All entries are written to
    the module-level ``collection`` in a single ``insert_many`` call.

    Linked pages whose request raises ``RequestException`` are skipped.
    A failure to fetch the start page itself is not caught and
    propagates to the caller.
    """
    start_url = "http://stackoverflow.com/questions/3868753/find-phone-numbers-in-python-script"
    # Without a timeout a single unresponsive server stalls the crawl
    # indefinitely; timeouts raise a RequestException subclass.
    timeout_seconds = 10

    start_webpage = requests.get(start_url, timeout=timeout_seconds).content
    phone_numbers = [
        make_entry(number, start_url)
        for number in find_numbers(start_webpage)
    ]

    for link in get_links(start_webpage):
        try:
            webpage2 = requests.get(link, timeout=timeout_seconds).content
        except RequestException:
            # Unreachable, malformed or relative links are skipped.
            continue
        phone_numbers.extend(
            make_entry(number, link) for number in find_numbers(webpage2)
        )

    # insert_many rejects an empty document list, so only write when the
    # crawl actually found something.
    if phone_numbers:
        collection.insert_many(phone_numbers)
# Run the scraper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    maine()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment