Skip to content

Instantly share code, notes, and snippets.

@shahinism
Created June 19, 2019 12:51
Show Gist options
  • Save shahinism/f3d0034c5b264dddc7d0af84617927eb to your computer and use it in GitHub Desktop.
Save shahinism/f3d0034c5b264dddc7d0af84617927eb to your computer and use it in GitHub Desktop.
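A simple script that downloads every post from the Coding Horror blog's RSS feed and saves the titles and body text (with code snippets removed) to a plain-text file.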
atoma==0.0.15
attrs==19.1.0
backcall==0.1.0
certifi==2019.6.16
chardet==3.0.4
cssselect==1.0.3
decorator==4.4.0
defusedxml==0.6.0
entrypoints==0.3
fancycompleter==0.8
flake8==3.7.7
idna==2.8
ipython==7.5.0
ipython-genutils==0.2.0
jedi==0.13.3
logzero==1.5.0
lxml==4.3.4
mccabe==0.6.1
parso==0.4.0
pdbpp==0.10.0
pexpect==4.7.0
pickleshare==0.7.5
prompt-toolkit==2.0.9
ptyprocess==0.6.0
pycodestyle==2.5.0
pyflakes==2.1.1
Pygments==2.4.2
pyquery==1.4.0
python-dateutil==2.8.0
requests==2.22.0
six==1.12.0
traitlets==4.3.2
urllib3==1.25.3
wcwidth==0.1.7
wmctrl==0.3
import requests
import atoma
from urllib.parse import urljoin
from logzero import logger
from pyquery import PyQuery as pq
class Ghost:
    """Client for a paginated RSS feed.

    Pages are addressed by appending the page number to the feed URL
    (``<feed_url>/<page_number>``).  The class name suggests the Ghost
    blogging platform, whose feeds are presumably laid out this way.
    """

    def __init__(self, feed_url, timeout=30):
        """Remember where the feed lives.

        Args:
            feed_url: Base URL of the RSS feed, e.g.
                ``'https://blog.codinghorror.com/rss/'``.
            timeout: Seconds to wait on each HTTP request before
                ``requests`` gives up and raises.
        """
        self.feed_url = feed_url
        self.timeout = timeout

    def get_feed_page(self, page_number):
        """Download and parse a single page of the feed.

        Args:
            page_number: Page to fetch; it is appended to the feed URL.

        Returns:
            The object returned by ``atoma.parse_rss_bytes``, or ``None``
            when the server answers with anything other than HTTP 200.
        """
        # urljoin() replaces the last path segment of a base URL that lacks
        # a trailing slash ('.../rss' + '1' -> '.../1'), so normalise first.
        if self.feed_url.endswith('/'):
            base_url = self.feed_url
        else:
            base_url = f'{self.feed_url}/'
        url = urljoin(base_url, str(page_number))
        logger.info(f'getting {url}')
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, timeout=self.timeout)
        if response.status_code != 200:
            return None
        return atoma.parse_rss_bytes(response.content)

    def get_all_feed_items(self, page=1):
        """Collect the items of every feed page, starting at ``page``.

        Pages are fetched in increasing order until ``get_feed_page``
        returns nothing, or until a page is empty or contains only links
        that were already seen.  The latter guards against a server that
        keeps answering 200 past the last page, which would otherwise loop
        forever.

        Args:
            page: Number of the first page to download.

        Returns:
            A list with the items of all downloaded pages, in feed order.
        """
        items = []
        seen_links = set()
        while True:
            logger.info(f'downloading page {page}...')
            parsed_feed = self.get_feed_page(page)
            if not parsed_feed:
                break
            page_items = list(parsed_feed.items or ())
            new_links = {item.link for item in page_items} - seen_links
            if not new_links:
                # Nothing new on this page: we are past the end of the feed.
                break
            items += page_items
            # Track unique links incrementally instead of rebuilding the
            # whole set from ``items`` on every page.
            seen_links |= new_links
            logger.info(f'total items: {len(seen_links)}')
            page += 1
        return items
def clean_html(html):
    """Parse ``html`` with PyQuery and drop every ``<code>`` element.

    Returns the PyQuery document itself (not a string), so the caller can
    keep querying it or call ``.text()`` on it.
    """
    document = pq(html)
    code_elements = document('code')
    # remove() acts on ``document`` in place; the same object is returned.
    code_elements.remove()
    return document
# Download every item of the Coding Horror feed and dump title + body text
# (with <code> elements stripped) into a single text file.
feed_url = 'https://blog.codinghorror.com/rss/'
ghost = Ghost(feed_url)
response = ghost.get_all_feed_items()

# Gather one chunk per post and join once at the end instead of growing a
# string with repeated ``+=``.
chunks = []
for item in response:
    # NOTE(review): atoma appears to leave ``content_encoded`` unset (None)
    # for items without a <content:encoded> element -- confirm.  Such items
    # are written with an empty body instead of being handed to pq().
    if item.content_encoded:
        body_text = clean_html(item.content_encoded).text()
    else:
        body_text = ''
    chunks.append(f'{item.title}\n{body_text}\n\n')
content = ''.join(chunks)

# Blog text contains non-ASCII characters; an explicit encoding keeps the
# write from depending on (and possibly failing under) the platform default.
with open('codinghorror.txt', 'w', encoding='utf-8') as fp:
    fp.write(content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment