basic scraping
import sys

import httpx
import feedparser
from bs4 import BeautifulSoup
from sqlalchemy.exc import IntegrityError
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String
# create db engine and the articles table
engine = create_engine('sqlite:///crypto_articles.db', echo=True)
meta = MetaData()

articles = Table(
    'articles', meta,
    Column('id', Integer, primary_key=True),
    Column('link', String, unique=True),
    Column('datetime', String),
    Column('title', String),
    Column('text', String),
)
meta.create_all(engine)
# parse links from the aggregated rss page
url = 'https://nitter.dark.fail/CryptoNewswire/rss'
page = httpx.get(url)
soup = BeautifulSoup(page.text, 'lxml')
links = soup.find_all('a')

filtered_links = []
for link in links:
    href = link['href']
    # skip the Shopping.io link and keep only external (non-nitter) article links
    if href == 'http://Shopping.io':
        continue
    if 'nitter' not in href:
        filtered_links.append(href)

d = feedparser.parse(url)
# sanity check: one scraped link per feed entry, so the indexes below line up
assert len(filtered_links) == len(d.entries)
# parse each linked page for the actual article
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en",
}

for i, link in enumerate(filtered_links):
    output = ''
    site = httpx.get(link)
    soup = BeautifulSoup(site.text, 'html.parser')
    date = d.entries[i].published

    # the nitter tweet page links out to the original article
    link = soup.center.a['href']
    output += link + '\n'
    output += date + '\n'

    site2 = httpx.get(link, headers=headers)
    soup2 = BeautifulSoup(site2.text, 'lxml')
    text = soup2.find_all(text=True)
    #print(set([t.parent.name for t in text]))

    # tags whose text is page chrome (scripts, navigation, headings, wrappers) rather than article body
    blacklist = [
        'b',
        'script',
        #'span',
        #'li',
        'h4',
        'noscript',
        'em',
        'header',
        'button',
        'time',
        'nav',
        'strong',
        'html',
        #'p',
        'h2',
        'style',
        'ul',
        'main',
        'form',
        'ins',
        'h6',
        'head',
        #'title',
        'aside',
        'body',
        '[document]',
        'h1',
        'article',
        'div',
        'a',
    ]

    # collect the page title and body text from the remaining nodes
    title = ''
    content = ''
    for t in text:
        if t.parent.name not in blacklist:
            if t.parent.name == 'title':
                title = t
                output += f'{t} \n'
            else:
                content += f'{t} '
                output += f'{t} '
    #print(output)
    #print(content)
    #print('\n')

    try:
        ins = articles.insert().values(link=link, datetime=date, title=title, text=content)
        conn = engine.connect()
        result = conn.execute(ins)
    # a duplicate link violates the unique constraint on 'link', so end the run
    except IntegrityError:
        #print("\nensquanche - non-unique link, exiting")
        sys.exit()
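
To sanity-check what a run actually saved, the rows can be read back from crypto_articles.db with the same SQLAlchemy Core API. This is only a sketch under the assumptions above (same database file, the 1.x-style Core usage the script relies on); it reflects the existing table rather than repeating the column definitions:

# read-back sketch: assumes crypto_articles.db was populated by the scraper above
from sqlalchemy import create_engine, MetaData, Table

engine = create_engine('sqlite:///crypto_articles.db')
meta = MetaData()
articles = Table('articles', meta, autoload_with=engine)  # reflect the existing schema

with engine.connect() as conn:
    for row in conn.execute(articles.select()):
        print(row['datetime'], row['title'], row['link'])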