Skip to content

Instantly share code, notes, and snippets.

@iamzoltan
Last active July 15, 2021 21:38
Show Gist options
  • Save iamzoltan/0390fe271cbf75eb1135174dbc0d66be to your computer and use it in GitHub Desktop.
Save iamzoltan/0390fe271cbf75eb1135174dbc0d66be to your computer and use it in GitHub Desktop.
basic scraping
import httpx
import feedparser
from bs4 import BeautifulSoup
from sqlalchemy.exc import IntegrityError
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String, DateTime
# --- Database setup -------------------------------------------------------
# SQLite file in the working directory; echo=True logs every emitted SQL
# statement to stdout.
engine = create_engine('sqlite:///crypto_articles.db', echo=True)
meta = MetaData()

# One row per scraped article.  `link` is declared UNIQUE so inserting an
# already-seen article raises IntegrityError, which the scraping loop below
# uses as its "caught up, stop here" signal.
articles = Table(
    'articles', meta,
    Column('id', Integer, primary_key=True),
    Column('link', String, unique=True),
    Column('datetime', String),
    Column('title', String),
    Column('text', String),
)

# Create the table if it does not already exist (no-op otherwise).
meta.create_all(engine)
# --- Collect article links from the aggregated RSS page -------------------
url = 'https://nitter.dark.fail/CryptoNewswire/rss'
page = httpx.get(url)
soup = BeautifulSoup(page.text, 'lxml')

# Keep only outbound article links: skip the pinned Shopping.io promo link
# and nitter's own navigation links.  `.get()` instead of `link['href']`
# so anchors without an href attribute don't raise KeyError.
filtered_links = []
for link in soup.find_all('a'):
    href = link.get('href')
    if not href or href == 'http://Shopping.io':
        continue
    if 'nitter' not in href:
        filtered_links.append(href)

# Parse the same URL as a feed so each link can be paired positionally with
# its publication date in the loop below.
d = feedparser.parse(url)

# The pairing below relies on filtered_links[i] matching d.entries[i].
# A bare `assert` would be stripped under `python -O`; fail loudly instead.
if len(filtered_links) != len(d.entries):
    raise RuntimeError(
        f'link/entry count mismatch: {len(filtered_links)} links vs '
        f'{len(d.entries)} feed entries'
    )
# --- Scrape each linked article page and store it -------------------------
# NOTE(review): the original built a full realistic-browser headers dict and
# then immediately overwrote it with this one-line dict, so the realistic
# headers were dead code.  The dict actually sent is kept; if sites start
# blocking the novelty User-Agent, restore a real browser UA string here.
# Both the headers and the boilerplate-tag set are loop invariants, so they
# are built once, outside the loop; frozenset gives O(1) membership tests.
headers = {'User-Agent': 'Nintendo 64 8-bit browser'}
boilerplate_tags = frozenset({
    'b', 'script', 'h4', 'noscript', 'em', 'header', 'button', 'time',
    'nav', 'strong', 'html', 'h2', 'style', 'ul', 'main', 'form', 'ins',
    'h6', 'head', 'aside', 'body', '[document]', 'h1', 'article', 'div', 'a',
})

for i, tweet_url in enumerate(filtered_links):
    output = ''

    # The nitter page for the tweet carries the outbound article URL in its
    # centered anchor; the feed entry at the same index supplies the date.
    site = httpx.get(tweet_url)
    soup = BeautifulSoup(site.text, 'html.parser')
    date = d.entries[i].published
    article_url = soup.center.a['href']
    output += article_url + '\n'
    output += date + '\n'

    # Fetch the article itself and walk every text node in the document.
    site2 = httpx.get(article_url, headers=headers)
    soup2 = BeautifulSoup(site2.text, 'lxml')
    text_nodes = soup2.find_all(text=True)

    # Initialize `title` so the INSERT below cannot hit a NameError when a
    # page has no <title> tag (the original left it unbound in that case).
    title = ''
    content = ''
    for t in text_nodes:
        if t.parent.name in boilerplate_tags:
            continue
        if t.parent.name == 'title':
            # Store the page title in its own column.
            title = t
            output += f'{t} \n'
        else:
            content += f'{t} '
            output += f'{t} '

    try:
        ins = articles.insert().values(
            link=article_url, datetime=date, title=title, text=content)
        # Context manager closes the connection after each insert; the
        # original opened a new connection per iteration and never closed it.
        # NOTE(review): relies on legacy (pre-2.0) SQLAlchemy autocommit, as
        # the original did; under SQLAlchemy 2.0 use engine.begin() instead.
        with engine.connect() as conn:
            conn.execute(ins)
    except IntegrityError:
        # UNIQUE(link) violated: this article was stored on a previous run,
        # so everything after it in the feed is older — stop the run.
        break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment