Skip to content

Instantly share code, notes, and snippets.

@cydu
Last active August 29, 2015 14:09
Show Gist options
  • Save cydu/40d67beecaf0cf2a4cc8 to your computer and use it in GitHub Desktop.
Save cydu/40d67beecaf0cf2a4cc8 to your computer and use it in GitHub Desktop.
Merge all geeksforgeeks.org posts under a tag or category into one HTML file, then convert it to PDF
import sys
import requests
from xhtml2pdf import pisa
from bs4 import BeautifulSoup
def convertHtmlToPdf(sourceHtml, outputFilename):
    """Render an HTML string into a PDF file using xhtml2pdf.

    Args:
        sourceHtml: HTML markup handed to ``pisa.CreatePDF`` as the source.
        outputFilename: Path of the PDF to write. The file is opened in
            ``"w+b"`` mode, so an existing file is truncated.

    Returns:
        The ``err`` attribute of the status object returned by
        ``pisa.CreatePDF`` (presumably falsy on success -- confirm against
        the xhtml2pdf documentation).
    """
    # A parenthesised single-argument print behaves the same on Python 2 and 3.
    print("generate pdf to " + outputFilename)
    # The context manager closes the file even when CreatePDF raises, which
    # the previous explicit close() call did not guarantee.
    with open(outputFilename, "w+b") as resultFile:
        pisaStatus = pisa.CreatePDF(
            sourceHtml,
            dest=resultFile,
            encoding='utf-8')
    return pisaStatus.err
def fetchPosts(url, timeout=30):
    """Collect the posts linked from a listing page and its follow-up pages.

    The page at ``url`` is downloaded and every element with class
    ``post-title`` contributes one ``(href, title)`` pair taken from its
    first ``<a>`` child. The same extraction is then repeated for each page
    linked by an element matched by ``class_='page larger'`` (presumably the
    pagination links to later pages -- confirm against the site markup).

    Args:
        url: Address of the first listing page.
        timeout: Seconds to wait for each HTTP request before ``requests``
            raises, so a stalled connection cannot hang the script forever.

    Returns:
        A list of ``(href, title)`` tuples in the order they were found.
        Entries without an anchor or without an ``href`` are skipped; when
        the anchor has no ``title`` attribute its text is used instead.
    """
    def loadSoup(pageUrl):
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        response = requests.get(pageUrl, timeout=timeout)
        return BeautifulSoup(response.content, "html.parser")

    def extractPosts(soup):
        found = []
        for post in soup.find_all(class_='post-title'):
            anchor = post.a
            if anchor is None or not anchor.get('href'):
                # Nothing to link to; skip instead of raising.
                continue
            title = anchor.get('title') or anchor.get_text(strip=True)
            found.append((anchor['href'], title))
        return found

    firstPage = loadSoup(url)
    posts = extractPosts(firstPage)
    for page in firstPage.find_all(class_='page larger'):
        pageUrl = page.get('href')
        if not pageUrl:
            continue
        posts.extend(extractPosts(loadSoup(pageUrl)))
    return posts
def postsToPdf(seed_url, timeout=30):
    """Merge every post listed under ``seed_url`` into one HTML file and a PDF.

    The merged document starts with the seed URL as a heading, followed by
    a list of links to all posts, followed by the ``post-info`` element of
    each post page. It is written to ``<name>.html`` (UTF-8) and then passed
    to ``convertHtmlToPdf`` to produce ``<name>.pdf``, where ``<name>`` is
    the last non-empty path segment of ``seed_url``.

    Args:
        seed_url: Listing page whose posts are collected via ``fetchPosts``.
        timeout: Seconds to wait for each post download before giving up on
            that post.

    Posts that cannot be downloaded, or that have no ``post-info`` element,
    are reported on stdout and skipped.
    """
    # Function-local import keeps the file's import block untouched; this
    # module is available on both Python 2 and Python 3.
    from xml.sax.saxutils import escape, quoteattr

    # Strip trailing slashes first so that "…/tag/foo" and "…/tag/foo/" both
    # yield "foo" (indexing [-2] returned "tag" for the former).
    name = seed_url.rstrip('/').split('/')[-1]

    posts = fetchPosts(seed_url)
    print("total post number: " + str(len(posts)))

    # Collect fragments and join once instead of repeated string +=.
    parts = ["<h1>" + escape(seed_url) + "</h1><br>"]

    # Table of contents. Titles and URLs come from scraped pages, so they
    # are escaped to keep characters such as & < " from breaking the markup.
    for link, title in posts:
        parts.append(" <br> <a href=" + quoteattr(link)
                     + " target=\"_blank\">" + escape(title) + "</a>")

    for link, title in posts:
        print("fetch post: " + link)
        try:
            response = requests.get(link, timeout=timeout)
            page = BeautifulSoup(response.content, "html.parser")
            # find() returns None when the element is missing; the resulting
            # AttributeError is handled below as a skipped post.
            parts.append(" <br> " + page.find(class_='post-info').prettify())
        except Exception:
            # Best effort: skip posts that fail, but let KeyboardInterrupt
            # and SystemExit propagate (a bare except would swallow them).
            print("Skip Post: " + link + " " + title)

    html = "".join(parts)

    # Binary mode because encoded bytes are written; the context manager
    # closes the file even if the write fails.
    with open(name + ".html", "wb") as f:
        f.write(html.encode('utf-8'))

    # convertHtmlToPdf returns the xhtml2pdf error status; report it rather
    # than dropping it silently.
    if convertHtmlToPdf(html, name + ".pdf"):
        print("pdf conversion reported errors: " + name + ".pdf")
if __name__ == "__main__":
    # Listing pages processed when no URL is given on the command line.
    default_seed_urls = [
        'http://www.geeksforgeeks.org/tag/dynamic-programming/',
        'http://www.geeksforgeeks.org/tag/advance-data-structures/',
        'http://www.geeksforgeeks.org/tag/Greedy-Algorithm/',
        'http://www.geeksforgeeks.org/tag/backtracking/',
        'http://www.geeksforgeeks.org/tag/pattern-searching/',
        'http://www.geeksforgeeks.org/tag/divide-and-conquer/',
        'http://www.geeksforgeeks.org/tag/MathematicalAlgo/',
        'http://www.geeksforgeeks.org/tag/recursion/',
        'http://www.geeksforgeeks.org/tag/geometric-algorithms/',
        'http://www.geeksforgeeks.org/category/graph/',
        'http://www.geeksforgeeks.org/category/tree/',
        'http://www.geeksforgeeks.org/category/multiple-choice-question/',
        'http://www.geeksforgeeks.org/category/bit-magic/',
        'http://www.geeksforgeeks.org/category/c-strings/',
    ]
    # Every command-line argument is treated as a seed URL (previously only
    # the first one was used and the rest were silently ignored). With no
    # arguments the default list above is processed.
    seed_urls = sys.argv[1:] or default_seed_urls
    for seed_url in seed_urls:
        postsToPdf(seed_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment