Skip to content

Instantly share code, notes, and snippets.

@uliang
Last active December 11, 2022 14:08
Show Gist options
  • Save uliang/c22b0d505c2ee30c61e1c3412a7da328 to your computer and use it in GitHub Desktop.
Save uliang/c22b0d505c2ee30c61e1c3412a7da328 to your computer and use it in GitHub Desktop.
Sample Web Scraping function
import re
import urllib.parse
from collections import deque

import requests
from bs4 import BeautifulSoup
def _canonical_url(link, host='www.malaymail.com'):
    """Return *link* as an absolute https URL on *host*, or None if unparsable.

    The scheme and host are always overwritten, so relative links become
    absolute and links pointing at other hosts are redirected onto *host*.
    The fragment is dropped because it never reaches the server and would
    otherwise make one page look like several distinct URLs.
    """
    try:
        parts = urllib.parse.urlparse(link)
    except ValueError:
        # Malformed hrefs (e.g. an unbalanced IPv6 bracket) are simply skipped.
        return None
    return parts._replace(scheme='https', netloc=host, fragment='').geturl()


def scrape(max_visits=10_000):
    """Crawl www.malaymail.com breadth-first and collect news article text.

    The crawl starts at the site root and follows every anchor whose href
    matches ``news/\\w+``. For each fetched page whose URL contains
    ``www.malaymail.com/news``, the text of the first
    ``<div class="article-body">`` element (if any) is collected.

    Args:
        max_visits: Upper bound on the number of URLs that are requested.
            Values <= 0 result in no requests at all.

    Returns:
        A tuple ``(articles, visited)``: ``articles`` is a list of article
        body strings, ``visited`` is the list of URLs that were requested
        (including ones whose request failed), in crawl order.
    """
    news_link = re.compile(r'news/\w+')

    # Seed with the site root. A bare 'www.malaymail.com' has no scheme, so
    # urlparse would treat it as a *path* and yield
    # 'https://www.malaymail.com/www.malaymail.com'.
    start = _canonical_url('/')
    queue = deque([start])
    # URLs are de-duplicated when queued so duplicates neither bloat the
    # queue nor use up the visit budget.
    queued = {start}
    articles, visited = [], []

    while queue and len(visited) < max_visits:
        url = queue.popleft()
        # Record the attempt even if the request fails, so it is never retried.
        visited.append(url)
        try:
            response = requests.get(url, timeout=5)
        except requests.RequestException:
            # Best-effort crawl: skip unreachable pages, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue

        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.content, 'html.parser')

        # 'a[href]' skips anchors without an href attribute.
        for anchor in soup.select('a[href]'):
            href = anchor['href']
            if news_link.search(href) is None:
                continue
            target = _canonical_url(href)
            if target is not None and target not in queued:
                queued.add(target)
                queue.append(target)

        if 'www.malaymail.com/news' in url:
            body = soup.find('div', class_='article-body')
            if body is not None:
                articles.append(body.get_text())

    return articles, visited
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment