Skip to content

Instantly share code, notes, and snippets.

@ejmurray
Created August 3, 2015 14:58
Show Gist options
  • Save ejmurray/5fdc3f2f74ccff9cd46a to your computer and use it in GitHub Desktop.
Save ejmurray/5fdc3f2f74ccff9cd46a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Known issues: in a few cases the script does not return the correct number
# of lines, and there is a problem with UTF-8 encoding.
# TODO: could the code in scrapePubMed2.py be used to sort out the encoding issue?
__author__ = 'Ernest'
from bs4 import BeautifulSoup
import urllib2
import re
# Download one DMD table-of-contents page, collect its <h4> headings
# (presumably the article titles -- verify against the page markup), print
# them, and save them one per line as UTF-8 in dmdTitles2.txt.
url = "http://dmd.aspetjournals.org/content/43/3.toc"
page = urllib2.urlopen(url)
try:
    # "lxml" selects the parser; from_encoding tells BeautifulSoup how to
    # decode the downloaded bytes.
    soup_page = BeautifulSoup(page, "lxml", from_encoding="utf-8")
finally:
    # The document is fully parsed by now, so release the HTTP response.
    page.close()

# Every <h4> element on the page, in document order.
title_entries = soup_page.find_all("h4")

# Binary mode plus an explicit encode keeps the file UTF-8 regardless of the
# interpreter's default codec; the with-block guarantees the file is closed.
with open("dmdTitles2.txt", "wb") as f:
    for title in title_entries:
        # Tag.string is None whenever a heading has more than one child node
        # (e.g. nested <i> or <sub>), which made such titles come out as
        # "None". get_text() joins every text fragment instead; split/join
        # collapses the line breaks and runs of spaces left by the markup.
        text = " ".join(title.get_text().split())
        f.write(text.encode("utf-8") + b"\n")
        # NOTE(review): the console echo still depends on the terminal
        # encoding; the file above is the reliable output.
        print(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment